Example 1
def load_raw_data(assets,
                  dates,
                  data_query_time,
                  data_query_tz,
                  expr,
                  odo_kwargs,
                  checkpoints=None):
    """
    Given an expression representing data to load, perform normalization and
    forward-filling and return the data, materialized. Only accepts data with a
    `sid` field.

    Parameters
    ----------
    assets : pd.Int64Index
        The assets to load data for.
    dates : pd.DatetimeIndex
        The simulation dates to load data for.
    data_query_time : datetime.time
        The time used as the cutoff for new information.
    data_query_tz : tzinfo
        The timezone to normalize the dates to before comparing against
        `data_query_time`.
    expr : Expr
        The expression representing the data to load.
    odo_kwargs : dict
        Extra keyword arguments to pass to odo when executing the expression.
    checkpoints : Expr, optional
        The expression representing the checkpointed data for `expr`.

    Returns
    -------
    raw : pd.DataFrame
        The result of computing `expr`, materialized as a DataFrame.
    """
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )
    raw = ffill_query_in_range(
        expr,
        lower_dt,
        upper_dt,
        checkpoints=checkpoints,
        odo_kwargs=odo_kwargs,
    )
    sids = raw[SID_FIELD_NAME]
    raw.drop(sids[~sids.isin(assets)].index, inplace=True)
    if data_query_time is not None:
        normalize_timestamp_to_query_time(
            raw,
            data_query_time,
            data_query_tz,
            inplace=True,
            ts_field=TS_FIELD_NAME,
        )
    return raw
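The easy-to-miss step in `load_raw_data` is the sid filter between the forward-fill query and the timestamp normalization. Below is a minimal, pandas-only sketch of that filter, using a made-up frame and asset list in place of the real query result:

import pandas as pd

# Hypothetical stand-ins for the materialized query result and the
# requested assets.
raw = pd.DataFrame({
    'sid': [1, 2, 3, 2],
    'value': [10.0, 20.0, 30.0, 25.0],
})
assets = pd.Index([1, 2])

# Drop every row whose sid is not among the requested assets, in place.
sids = raw['sid']
raw.drop(sids[~sids.isin(assets)].index, inplace=True)
# raw now contains only the rows with sid 1 or 2.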
Example 2
    def load_adjusted_array(self, columns, dates, assets, mask):
        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        raw = ffill_query_in_range(
            self._expr,
            lower_dt,
            upper_dt,
            self._odo_kwargs,
        )
        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(
            sids[~sids.isin(assets)].index,
            inplace=True
        )
        if data_query_time is not None:
            normalize_timestamp_to_query_time(
                raw,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )
        gb = raw.groupby(SID_FIELD_NAME)
        return self.concrete_loader(
            dates,
            self.prepare_data(raw, gb),
            dataset=self._dataset,
        ).load_adjusted_array(columns, dates, assets, mask)
Example 3
    def load_adjusted_array(self, columns, dates, assets, mask):
        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        raw = ffill_query_in_range(
            self._expr,
            lower_dt,
            upper_dt,
            self._odo_kwargs,
        )
        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(
            sids[~sids.isin(assets)].index,
            inplace=True
        )
        if data_query_time is not None:
            normalize_timestamp_to_query_time(
                raw,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

        gb = raw.groupby(SID_FIELD_NAME)

        def mkseries(idx, raw_loc=raw.loc):
            vs = raw_loc[
                idx, [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]
            ].values
            return pd.Series(
                index=pd.DatetimeIndex(vs[:, 0]),
                data=vs[:, 1],
            )

        return EarningsCalendarLoader(
            dates,
            valmap(mkseries, gb.groups),
            dataset=self._dataset,
        ).load_adjusted_array(columns, dates, assets, mask)
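For context, `gb.groups` maps each sid to the row labels of that sid's records, and `toolz.valmap` applies `mkseries` to each of those label sets. Here is a small self-contained sketch of the same pattern, with hypothetical column names standing in for TS_FIELD_NAME and ANNOUNCEMENT_FIELD_NAME:

import pandas as pd
from toolz import valmap

raw = pd.DataFrame({
    'sid': [1, 1, 2],
    'timestamp': pd.to_datetime(['2014-01-02', '2014-01-05', '2014-01-03']),
    'announcement_date': pd.to_datetime(
        ['2014-01-10', '2014-01-12', '2014-01-11']
    ),
})
gb = raw.groupby('sid')

def mkseries(idx, raw_loc=raw.loc):
    # One Series per sid: knowledge timestamp -> announcement date.
    vs = raw_loc[idx, ['timestamp', 'announcement_date']].values
    return pd.Series(index=pd.DatetimeIndex(vs[:, 0]), data=vs[:, 1])

per_sid = valmap(mkseries, gb.groups)  # {1: Series(...), 2: Series(...)}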
Example 4
    def load_adjusted_array(self, columns, dates, assets, mask):
        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        raw = ffill_query_in_range(
            self._expr,
            lower_dt,
            upper_dt,
            self._odo_kwargs,
        )
        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(
            sids[~sids.isin(assets)].index,
            inplace=True
        )
        if data_query_time is not None:
            normalize_timestamp_to_query_time(
                raw,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

        return EventsLoader(
            events=raw,
            next_value_columns=self._next_value_columns,
            previous_value_columns=self._previous_value_columns,
        ).load_adjusted_array(
            columns,
            dates,
            assets,
            mask,
        )
Example 5
    def load_adjusted_array(self, columns, dates, assets, mask):
        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        raw = ffill_query_in_range(
            self._expr,
            lower_dt,
            upper_dt,
            self._odo_kwargs,
        )
        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(sids[~sids.isin(assets)].index, inplace=True)
        if data_query_time is not None:
            normalize_timestamp_to_query_time(
                raw,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

        return EventsLoader(
            events=raw,
            next_value_columns=self._next_value_columns,
            previous_value_columns=self._previous_value_columns,
        ).load_adjusted_array(
            columns,
            dates,
            assets,
            mask,
        )
Example 6
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs = self[dataset]
        have_sids = (dataset.ndim == 2)
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )
        colnames = added_query_fields + list(map(getname, columns))

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        if checkpoints is not None:
            ts = checkpoints[TS_FIELD_NAME]
            checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
            if pd.isnull(checkpoints_ts):
                materialized_checkpoints = pd.DataFrame(columns=colnames)
                lower = None
            else:
                materialized_checkpoints = odo(
                    checkpoints[ts == checkpoints_ts][colnames],
                    pd.DataFrame,
                    **odo_kwargs
                )
                lower = checkpoints_ts
        else:
            materialized_checkpoints = pd.DataFrame(columns=colnames)
            lower = None

        materialized_expr = collect_expr(expr, lower)
        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr,
                ),
                ignore_index=True,
                copy=False,
            )
        materialized_deltas = (
            collect_expr(deltas, lower)
            if deltas is not None else
            pd.DataFrame(columns=colnames)
        )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)
            ]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[
                    :, TS_FIELD_NAME
                ].astype('datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        def last_in_date_group(df, reindex, have_sids=have_sids):
            idx = dates[dates.searchsorted(
                df[TS_FIELD_NAME].values.astype('datetime64[D]')
            )]
            if have_sids:
                idx = [idx, SID_FIELD_NAME]

            last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
                idx,
                sort=False,
            ).last()

            if have_sids:
                last_in_group = last_in_group.unstack()

            if reindex:
                if have_sids:
                    cols = last_in_group.columns
                    last_in_group = last_in_group.reindex(
                        index=dates,
                        columns=pd.MultiIndex.from_product(
                            (cols.levels[0], assets),
                            names=cols.names,
                        ),
                    )
                else:
                    last_in_group = last_in_group.reindex(dates)

            return last_in_group

        sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
        dense_output = last_in_date_group(sparse_output, reindex=True)
        dense_output.ffill(inplace=True)

        # Fill in missing values specified by each column. This is made
        # significantly more complex by the fact that we need to work around
        # two pandas issues:

        # 1) When we have sids, if there are no records for a given sid for any
        #    dates, pandas will generate a column full of NaNs for that sid.
        #    This means that some of the columns in `dense_output` are now
        #    float instead of the intended dtype, so we have to coerce back to
        #    our expected type and convert NaNs into the desired missing value.

        # 2) DataFrame.fillna assumes that receiving None as a fill-value means
        #    that no value was passed.  Consequently, there's no way to tell
        #    pandas to replace NaNs in an object column with None using fillna,
        #    so we have to roll our own instead using df.where.
        for column in columns:
            # Special logic for strings since `fillna` doesn't work if the
            # missing value is `None`.
            if column.dtype == categorical_dtype:
                dense_output[column.name] = dense_output[
                    column.name
                ].where(pd.notnull(dense_output[column.name]),
                        column.missing_value)
            else:
                # We need to execute `fillna` before `astype` in case the
                # column contains NaNs and needs to be cast to bool or int.
                # This is so that the NaNs are replaced first, since pandas
                # can't convert NaNs for those types.
                dense_output[column.name] = dense_output[
                    column.name
                ].fillna(column.missing_value).astype(column.dtype)

        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # If we do not have sids, use the column view to make a single
            # column vector which is unassociated with any assets.
            column_view = op.itemgetter(np.s_[:, np.newaxis])

            adjustments_from_deltas = adjustments_from_deltas_no_sids
            mask = np.full(
                shape=(len(mask), 1), fill_value=True, dtype=bool_dtype,
            )

        for column_idx, column in enumerate(columns):
            column_name = column.name
            yield column, AdjustedArray(
                column_view(
                    dense_output[column_name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column_name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
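The two pandas workarounds described in the comments above are easier to see in isolation. The following is a standalone sketch with made-up columns, assuming only pandas and numpy; the string branch mirrors the `.where(...)` call and the numeric branch mirrors `.fillna(...).astype(...)`:

import numpy as np
import pandas as pd

dense_output = pd.DataFrame({
    'rating': ['buy', np.nan, 'sell'],   # object (string) column
    'shares': [100.0, np.nan, 250.0],    # should end up as int64
})

# Strings: fillna treats None as "no fill value given", so use where() to
# keep the non-null entries and write the missing value into the holes.
dense_output['rating'] = dense_output['rating'].where(
    pd.notnull(dense_output['rating']), None,
)

# Numerics: replace NaN *before* casting, because NaN cannot be represented
# in an int (or bool) column.
dense_output['shares'] = dense_output['shares'].fillna(0).astype('int64')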
Example 7
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, odo_kwargs = self[dataset]
        have_sids = SID_FIELD_NAME in expr.fields
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def where(e, column):
            """Create the query to run against the resources.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            column : BoundColumn
                The column to query for.

            Returns
            -------
            q : Expr
                The query to run for the given column.
            """
            colname = column.name
            pred = e[TS_FIELD_NAME] <= lower_dt
            schema = e[colname].schema.measure
            if isinstance(schema, Option):
                pred &= e[colname].notnull()
                schema = schema.ty
            if schema in floating:
                pred &= ~e[colname].isnan()
            filtered = e[pred]
            lower = filtered.timestamp.max()

            if have_sids:
                # If we have sids, then we need to take the earliest of the
                # greatest date that has a non-null value by sid.
                lower = bz.by(
                    filtered[SID_FIELD_NAME],
                    timestamp=lower,
                ).timestamp.min()

            lower = odo(lower, pd.Timestamp)
            if lower is pd.NaT:
                # If there is no lower date, just query for data in the date
                # range. It must all be null anyway.
                lower = lower_dt

            return e[
                (e[TS_FIELD_NAME] >= lower) &
                (e[TS_FIELD_NAME] <= upper_dt)
            ][added_query_fields + [colname]]

        def collect_expr(e):
            """Execute and merge all of the per-column subqueries.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            return sort_values(reduce(
                partial(pd.merge, on=added_query_fields, how='outer'),
                (
                    odo(where(e, column), pd.DataFrame, **odo_kwargs)
                    for column in columns
                ),
            ), TS_FIELD_NAME)  # sort for the groupby later

        materialized_expr = collect_expr(expr)
        materialized_deltas = (
            collect_expr(deltas)
            if deltas is not None else
            pd.DataFrame(
                columns=added_query_fields + list(map(getname, columns)),
            )
        )

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[
                    :, TS_FIELD_NAME
                ].astype('datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        def last_in_date_group(df, reindex, have_sids=have_sids):
            idx = dates[dates.searchsorted(
                df[TS_FIELD_NAME].values.astype('datetime64[D]')
            )]
            if have_sids:
                idx = [idx, SID_FIELD_NAME]

            last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
                idx,
                sort=False,
            ).last()

            if have_sids:
                last_in_group = last_in_group.unstack()

            if reindex:
                if have_sids:
                    cols = last_in_group.columns
                    last_in_group = last_in_group.reindex(
                        index=dates,
                        columns=pd.MultiIndex.from_product(
                            (cols.levels[0], assets),
                            names=cols.names,
                        ),
                    )
                else:
                    last_in_group = last_in_group.reindex(dates)

            return last_in_group

        sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
        dense_output = last_in_date_group(sparse_output, reindex=True)
        dense_output.ffill(inplace=True)

        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # We use the column view to make an array per asset.
            column_view = compose(
                # We need to copy this because we need a concrete ndarray.
                # The `repeat_last_axis` call will give us a fancy strided
                # array which uses a buffer to represent `len(assets)` columns.
                # The engine puts nans at the indices for which we do not have
                # sid information so that the nan-aware reductions still work.
                # A future change to the engine would be to add first-class
                # support for macroeconomic datasets.
                copy,
                partial(repeat_last_axis, count=len(assets)),
            )
            adjustments_from_deltas = adjustments_from_deltas_no_sids

        for column_idx, column in enumerate(columns):
            column_name = column.name
            yield column, AdjustedArray(
                column_view(
                    dense_output[column_name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column_name,
                    asset_idx,
                    sparse_deltas,
                )
            )
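`collect_expr` above issues one subquery per requested column and stitches the results back together with an outer merge on the shared query fields. A sketch of just that merge step, with hypothetical frames and field names, assuming only pandas and functools:

from functools import partial, reduce

import pandas as pd

key = ['asof_date', 'timestamp', 'sid']  # stand-in for added_query_fields
per_column_frames = [
    pd.DataFrame({'asof_date': pd.to_datetime(['2014-01-02']),
                  'timestamp': pd.to_datetime(['2014-01-03']),
                  'sid': [1],
                  'value': [1.5]}),
    pd.DataFrame({'asof_date': pd.to_datetime(['2014-01-02']),
                  'timestamp': pd.to_datetime(['2014-01-03']),
                  'sid': [1],
                  'estimate': [1.2]}),
]

merged = reduce(partial(pd.merge, on=key, how='outer'), per_column_frames)
merged = merged.sort_values('timestamp')  # sort for the groupby later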
Example 8
File: core.py  Project: yu68/zipline
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset, ) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs, apply_deltas_adjustments = self[
            dataset]
        have_sids = (dataset.ndim == 2)
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME
                              } | ({SID_FIELD_NAME} if have_sids else set())
        requested_columns = set(map(getname, columns))
        colnames = sorted(added_query_fields | requested_columns)

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs)

        materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
        materialized_deltas = (self.pool.apply(collect_expr,
                                               (deltas, lower)) if deltas
                               is not None else pd.DataFrame(columns=colnames))

        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr.get(),
                ),
                ignore_index=True,
                copy=False,
            )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                    'datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        if AD_FIELD_NAME not in requested_columns:
            sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        sparse_deltas = last_in_date_group(non_novel_deltas,
                                           dates,
                                           assets,
                                           reindex=False,
                                           have_sids=have_sids)
        dense_output = last_in_date_group(sparse_output,
                                          dates,
                                          assets,
                                          reindex=True,
                                          have_sids=have_sids)
        ffill_across_cols(dense_output, columns,
                          {c.name: c.name
                           for c in columns})

        # By default, no non-novel deltas are applied.
        def no_adjustments_from_deltas(*args):
            return {}

        adjustments_from_deltas = no_adjustments_from_deltas
        if have_sids:
            if apply_deltas_adjustments:
                adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # If we do not have sids, use the column view to make a single
            # column vector which is unassociated with any assets.
            column_view = op.itemgetter(np.s_[:, np.newaxis])
            if apply_deltas_adjustments:
                adjustments_from_deltas = adjustments_from_deltas_no_sids
            mask = np.full(
                shape=(len(mask), 1),
                fill_value=True,
                dtype=bool_dtype,
            )

        return {
            column: AdjustedArray(
                column_view(
                    dense_output[column.name].values.astype(column.dtype), ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column.name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
            for column_idx, column in enumerate(columns)
        }
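Example 8 relies on a module-level `last_in_date_group` rather than the nested helper shown in the other examples. A condensed, pandas-only sketch of what it does in the sid case, on made-up data: align each record's (already normalized) timestamp with a simulation date via searchsorted, keep the last record per (date, sid), pivot sids into columns, then reindex to the full date and asset grid and forward-fill:

import pandas as pd

dates = pd.date_range('2014-01-06', periods=3)  # simulation dates
assets = [1, 2]
df = pd.DataFrame({
    'timestamp': pd.to_datetime(['2014-01-06 10:00', '2014-01-07 09:00']),
    'sid': [1, 1],
    'value': [1.0, 2.0],
})

# Bucket each record into a simulation date.
idx = dates[dates.searchsorted(df['timestamp'].values.astype('datetime64[D]'))]
last = df.drop('timestamp', axis=1).groupby([idx, 'sid'], sort=False).last()

# Pivot sids into columns and reindex to every (date, asset), forward-filling.
dense = last.unstack()
dense = dense.reindex(
    index=dates,
    columns=pd.MultiIndex.from_product(
        (dense.columns.levels[0], assets),
        names=dense.columns.names,
    ),
).ffill()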
Example 9
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset, ) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs = self[dataset]
        have_sids = SID_FIELD_NAME in expr.fields
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME
                              ] + ([SID_FIELD_NAME] if have_sids else [])
        colnames = added_query_fields + list(map(getname, columns))

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        if checkpoints is not None:
            ts = checkpoints[TS_FIELD_NAME]
            checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
            if pd.isnull(checkpoints_ts):
                materialized_checkpoints = pd.DataFrame(columns=colnames)
                lower = None
            else:
                materialized_checkpoints = odo(
                    checkpoints[ts == checkpoints_ts][colnames], pd.DataFrame,
                    **odo_kwargs)
                lower = checkpoints_ts
        else:
            materialized_checkpoints = pd.DataFrame(columns=colnames)
            lower = None

        materialized_expr = collect_expr(expr, lower)
        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr,
                ),
                ignore_index=True,
                copy=False,
            )
        materialized_deltas = (collect_expr(deltas, lower) if deltas
                               is not None else pd.DataFrame(columns=colnames))

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                    'datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        def last_in_date_group(df, reindex, have_sids=have_sids):
            idx = dates[dates.searchsorted(
                df[TS_FIELD_NAME].values.astype('datetime64[D]'))]
            if have_sids:
                idx = [idx, SID_FIELD_NAME]

            last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
                idx,
                sort=False,
            ).last()

            if have_sids:
                last_in_group = last_in_group.unstack()

            if reindex:
                if have_sids:
                    cols = last_in_group.columns
                    last_in_group = last_in_group.reindex(
                        index=dates,
                        columns=pd.MultiIndex.from_product(
                            (cols.levels[0], assets),
                            names=cols.names,
                        ),
                    )
                else:
                    last_in_group = last_in_group.reindex(dates)

            return last_in_group

        sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
        dense_output = last_in_date_group(sparse_output, reindex=True)
        dense_output.ffill(inplace=True)

        # Fill in missing values specified by each column. This is made
        # significantly more complex by the fact that we need to work around
        # two pandas issues:

        # 1) When we have sids, if there are no records for a given sid for any
        #    dates, pandas will generate a column full of NaNs for that sid.
        #    This means that some of the columns in `dense_output` are now
        #    float instead of the intended dtype, so we have to coerce back to
        #    our expected type and convert NaNs into the desired missing value.

        # 2) DataFrame.fillna assumes that receiving None as a fill-value means
        #    that no value was passed.  Consequently, there's no way to tell
        #    pandas to replace NaNs in an object column with None using fillna,
        #    so we have to roll our own instead using df.where.
        for column in columns:
            # Special logic for strings since `fillna` doesn't work if the
            # missing value is `None`.
            if column.dtype == categorical_dtype:
                dense_output[column.name] = dense_output[column.name].where(
                    pd.notnull(dense_output[column.name]),
                    column.missing_value)
            else:
                # We need to execute `fillna` before `astype` in case the
                # column contains NaNs and needs to be cast to bool or int.
                # This is so that the NaNs are replaced first, since pandas
                # can't convert NaNs for those types.
                dense_output[column.name] = dense_output[column.name].fillna(
                    column.missing_value).astype(column.dtype)

        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # We use the column view to make an array per asset.
            column_view = compose(
                # We need to copy this because we need a concrete ndarray.
                # The `repeat_last_axis` call will give us a fancy strided
                # array which uses a buffer to represent `len(assets)` columns.
                # The engine puts nans at the indices for which we do not have
                # sid information so that the nan-aware reductions still work.
                # A future change to the engine would be to add first-class
                # support for macroeconomic datasets.
                copy,
                partial(repeat_last_axis, count=len(assets)),
            )
            adjustments_from_deltas = adjustments_from_deltas_no_sids

        for column_idx, column in enumerate(columns):
            column_name = column.name
            yield column, AdjustedArray(
                column_view(
                    dense_output[column_name].values.astype(column.dtype), ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column_name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
Example 10
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (expr_data, ) = {self._table_expressions[c] for c in columns}
        except ValueError:
            raise AssertionError(
                'all columns must share the same expression data', )

        expr, deltas, checkpoints, odo_kwargs = expr_data

        have_sids = (first(columns).dataset.ndim == 2)
        added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME
                              } | ({SID_FIELD_NAME} if have_sids else set())
        requested_columns = set(map(getname, columns))
        colnames = sorted(added_query_fields | requested_columns)

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] < upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs)

        materialized_expr_deferred = self.pool.apply_async(
            collect_expr,
            (expr, lower),
        )
        materialized_deltas = (self.pool.apply(collect_expr, (deltas, lower))
                               if deltas is not None else None)

        all_rows = pd.concat(
            filter(
                lambda df: df is not None,
                (
                    materialized_checkpoints,
                    materialized_expr_deferred.get(),
                    materialized_deltas,
                ),
            ),
            ignore_index=True,
            copy=False,
        )

        all_rows[TS_FIELD_NAME] = all_rows[TS_FIELD_NAME].astype(
            'datetime64[ns]', )
        all_rows.sort_values([TS_FIELD_NAME, AD_FIELD_NAME], inplace=True)

        if have_sids:
            return adjusted_arrays_from_rows_with_assets(
                dates,
                data_query_time,
                data_query_tz,
                assets,
                mask,
                columns,
                all_rows,
            )
        else:
            return adjusted_arrays_from_rows_without_assets(
                dates,
                data_query_time,
                data_query_tz,
                None,
                columns,
                all_rows,
            )
Example 11
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, odo_kwargs = self[dataset]
        have_sids = SID_FIELD_NAME in expr.fields
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def where(e):
            """Create the query to run against the resources.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.

            Returns
            -------
            q : Expr
                The query to run.
            """
            def lower_for_col(column):
                pred = e[TS_FIELD_NAME] <= lower_dt
                colname = column.name
                schema = e[colname].schema.measure
                if isinstance(schema, Option):
                    pred &= e[colname].notnull()
                    schema = schema.ty
                if schema in floating:
                    pred &= ~e[colname].isnan()

                filtered = e[pred]
                lower = filtered[TS_FIELD_NAME].max()
                if have_sids:
                    # If we have sids, then we need to take the earliest of the
                    # greatest date that has a non-null value by sid.
                    lower = bz.by(
                        filtered[SID_FIELD_NAME],
                        timestamp=lower,
                    ).timestamp.min()
                return lower

            lower = odo(
                reduce(
                    bz.least,
                    map(lower_for_col, columns),
                ),
                pd.Timestamp,
                **odo_kwargs
            )
            if lower is pd.NaT:
                lower = lower_dt
            return e[
                (e[TS_FIELD_NAME] >= lower) &
                (e[TS_FIELD_NAME] <= upper_dt)
            ][added_query_fields + list(map(getname, columns))]

        def collect_expr(e):
            """Execute and merge all of the per-column subqueries.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            df = odo(where(e), pd.DataFrame, **odo_kwargs)
            df.sort_values(TS_FIELD_NAME, inplace=True)  # sort for the groupby later
            return df

        materialized_expr = collect_expr(expr)
        materialized_deltas = (
            collect_expr(deltas)
            if deltas is not None else
            pd.DataFrame(
                columns=added_query_fields + list(map(getname, columns)),
            )
        )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)
            ]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[
                    :, TS_FIELD_NAME
                ].astype('datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        def last_in_date_group(df, reindex, have_sids=have_sids):
            idx = dates[dates.searchsorted(
                df[TS_FIELD_NAME].values.astype('datetime64[D]')
            )]
            if have_sids:
                idx = [idx, SID_FIELD_NAME]

            last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
                idx,
                sort=False,
            ).last()

            if have_sids:
                last_in_group = last_in_group.unstack()

            if reindex:
                if have_sids:
                    cols = last_in_group.columns
                    last_in_group = last_in_group.reindex(
                        index=dates,
                        columns=pd.MultiIndex.from_product(
                            (cols.levels[0], assets),
                            names=cols.names,
                        ),
                    )
                else:
                    last_in_group = last_in_group.reindex(dates)

            return last_in_group

        sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
        dense_output = last_in_date_group(sparse_output, reindex=True)
        dense_output.ffill(inplace=True)

        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # We use the column view to make an array per asset.
            column_view = compose(
                # We need to copy this because we need a concrete ndarray.
                # The `repeat_last_axis` call will give us a fancy strided
                # array which uses a buffer to represent `len(assets)` columns.
                # The engine puts nans at the indices for which we do not have
                # sid information so that the nan-aware reductions still work.
                # A future change to the engine would be to add first-class
                # support for macroeconomic datasets.
                copy,
                partial(repeat_last_axis, count=len(assets)),
            )
            adjustments_from_deltas = adjustments_from_deltas_no_sids

        for column_idx, column in enumerate(columns):
            column_name = column.name
            yield column, AdjustedArray(
                column_view(
                    dense_output[column_name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column_name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
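The blaze expression built by `lower_for_col` computes, per column, the latest timestamp at or before `lower_dt` at which a non-null value exists, and in the sid case takes the earliest such timestamp across sids. The following is a pandas translation of that reduction (an interpretation of the query above, not code from the loader), on a made-up frame:

import pandas as pd

lower_dt = pd.Timestamp('2014-01-05')
df = pd.DataFrame({
    'sid': [1, 1, 2],
    'timestamp': pd.to_datetime(['2014-01-02', '2014-01-04', '2014-01-03']),
    'value': [1.0, None, 2.0],
})

# Latest non-null observation per sid at or before lower_dt...
known = df[(df['timestamp'] <= lower_dt) & df['value'].notnull()]
per_sid_latest = known.groupby('sid')['timestamp'].max()
# ...then the earliest of those, so the query reaches back far enough to
# give every sid a baseline value.
lower = per_sid_latest.min()  # Timestamp('2014-01-02')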
Example 12
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, resources = self[dataset]
        have_sids = SID_FIELD_NAME in expr.fields
        assets = list(map(int, assets))  # coerce from numpy.int64
        fields = list(map(dataset_name, columns))
        query_fields = fields + [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def where(e):
            """Create the query to run against the resources.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.

            Returns
            -------
            q : Expr
                The query to run.
            """
            ts = e[TS_FIELD_NAME]
            # Hack to get the lower bound to query:
            # This must be strictly executed because the data for `ts` will
            # be removed from scope too early otherwise.
            lower = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
            selection = ts <= upper_dt
            if have_sids:
                selection &= e[SID_FIELD_NAME].isin(assets)
            if lower is not pd.NaT:
                selection &= ts >= lower

            return e[selection][query_fields]

        extra_kwargs = {'d': resources} if resources else {}
        materialized_expr = odo(where(expr), pd.DataFrame, **extra_kwargs)
        materialized_deltas = (
            odo(where(deltas), pd.DataFrame, **extra_kwargs)
            if deltas is not None else
            pd.DataFrame(columns=query_fields)
        )

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[
                    :, TS_FIELD_NAME
                ].astype('datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        if have_sids:
            # Unstack by the sid so that we get a multi-index on the columns
            # of datacolumn, sid.
            sparse_output = sparse_output.set_index(
                [TS_FIELD_NAME, SID_FIELD_NAME],
            ).unstack()
            sparse_deltas = non_novel_deltas.set_index(
                [TS_FIELD_NAME, SID_FIELD_NAME],
            ).unstack()

            dense_output = sparse_output.reindex(dates, method='ffill')
            cols = dense_output.columns
            dense_output = dense_output.reindex(
                columns=pd.MultiIndex.from_product(
                    (cols.levels[0], assets),
                    names=cols.names,
                ),
            )

            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # We use the column view to make an array per asset.
            column_view = compose(
                # We need to copy this because we need a concrete ndarray.
                # The `repeat_last_axis` call will give us a fancy strided
                # array which uses a buffer to represent `len(assets)` columns.
                # The engine puts nans at the indices for which we do not have
                # sid information so that the nan-aware reductions still work.
                # A future change to the engine would be to add first-class
                # support for macroeconomic datasets.
                copy,
                partial(repeat_last_axis, count=len(assets)),
            )
            sparse_output = sparse_output.set_index(TS_FIELD_NAME)
            dense_output = sparse_output.reindex(dates, method='ffill')
            sparse_deltas = non_novel_deltas.set_index(TS_FIELD_NAME)
            adjustments_from_deltas = adjustments_from_deltas_no_sids

        for column_idx, column in enumerate(columns):
            column_name = column.name
            yield column, AdjustedArray(
                column_view(
                    dense_output[column_name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output.index,
                    column_idx,
                    column_name,
                    assets,
                    sparse_deltas,
                )
            )
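In the no-sid (macro data) branch above, `column_view` turns each per-date series into a dates-by-assets array. A numpy-only sketch of what `compose(copy, partial(repeat_last_axis, count=len(assets)))` achieves, using `np.broadcast_to` in place of zipline's `repeat_last_axis`:

import numpy as np

values = np.array([1.0, 2.0, 3.0])        # one value per simulation date
n_assets = 4

# broadcast_to gives a zero-stride (dates x assets) view; copy() makes it a
# concrete ndarray, which is what the copy in the compose() above is for.
column = np.broadcast_to(values[:, None], (len(values), n_assets)).copy()
print(column.shape)  # (3, 4)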
Example 13
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs = self[dataset]
        have_sids = (dataset.ndim == 2)
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )
        colnames = added_query_fields + list(map(getname, columns))

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs
        )

        materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
        materialized_deltas = (
            self.pool.apply(collect_expr, (deltas, lower))
            if deltas is not None else
            pd.DataFrame(columns=colnames)
        )

        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr.get(),
                ),
                ignore_index=True,
                copy=False,
            )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)
            ]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[
                    :, TS_FIELD_NAME
                ].astype('datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        sparse_deltas = last_in_date_group(non_novel_deltas,
                                           dates,
                                           assets,
                                           reindex=False,
                                           have_sids=have_sids)
        dense_output = last_in_date_group(sparse_output,
                                          dates,
                                          assets,
                                          reindex=True,
                                          have_sids=have_sids)
        ffill_across_cols(dense_output, columns, {c.name: c.name
                                                  for c in columns})
        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # If we do not have sids, use the column view to make a single
            # column vector which is unassociated with any assets.
            column_view = op.itemgetter(np.s_[:, np.newaxis])

            adjustments_from_deltas = adjustments_from_deltas_no_sids
            mask = np.full(
                shape=(len(mask), 1), fill_value=True, dtype=bool_dtype,
            )

        return {
            column: AdjustedArray(
                column_view(
                    dense_output[column.name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column.name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
            for column_idx, column in enumerate(columns)
        }
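A minimal pandas sketch of the dense/sparse step above: take the last record
observed per (date, sid) group and forward-fill it across the simulation
calendar, which is conceptually what last_in_date_group and ffill_across_cols
do here. The dates, sids, and column names below are made up for illustration;
this is not the actual zipline implementation.

import pandas as pd

dates = pd.date_range('2014-01-06', periods=4)  # hypothetical sim calendar
events = pd.DataFrame({
    'timestamp': pd.to_datetime(['2014-01-06', '2014-01-06', '2014-01-08']),
    'sid':       [65, 66, 65],
    'value':     [1.0, 2.0, 3.0],
})

# Last observation per (date, sid), pivoted to a dates x sids frame ...
last = events.groupby(['timestamp', 'sid'])['value'].last().unstack('sid')
# ... then made dense over the full calendar and forward-filled.
dense = last.reindex(dates).ffill()
print(dense)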
Example No. 14
0
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, odo_kwargs = self[dataset]
        have_sids = SID_FIELD_NAME in expr.fields
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def where(e):
            """Create the query to run against the resources.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.

            Returns
            -------
            q : Expr
                The query to run.
            """
            def lower_for_col(column):
                pred = e[TS_FIELD_NAME] <= lower_dt
                colname = column.name
                schema = e[colname].schema.measure
                if isinstance(schema, Option):
                    pred &= e[colname].notnull()
                    schema = schema.ty
                if schema in floating:
                    pred &= ~e[colname].isnan()

                filtered = e[pred]
                lower = filtered[TS_FIELD_NAME].max()
                if have_sids:
                    # If we have sids, then we need to take the earliest of the
                    # greatest date that has a non-null value by sid.
                    lower = bz.by(
                        filtered[SID_FIELD_NAME],
                        timestamp=lower,
                    ).timestamp.min()
                return lower

            lower = odo(
                reduce(
                    bz.least,
                    map(lower_for_col, columns),
                ),
                pd.Timestamp,
                **odo_kwargs
            )
            if lower is pd.NaT:
                lower = lower_dt
            return e[
                (e[TS_FIELD_NAME] >= lower) &
                (e[TS_FIELD_NAME] <= upper_dt)
            ][added_query_fields + list(map(getname, columns))]

        def collect_expr(e):
            """Execute and merge all of the per-column subqueries.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in-memory reindex will
            handle this.
            """
            df = odo(where(e), pd.DataFrame, **odo_kwargs)
            df.sort_values(TS_FIELD_NAME, inplace=True)  # sort for the groupby later
            return df

        materialized_expr = collect_expr(expr)
        materialized_deltas = (
            collect_expr(deltas)
            if deltas is not None else
            pd.DataFrame(
                columns=added_query_fields + list(map(getname, columns)),
            )
        )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)
            ]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[
                    :, TS_FIELD_NAME
                ].astype('datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

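        # Collapse the raw records to the last observation per simulation
        # date (and per sid when we have sids), optionally reindexing to the
        # dense dates x assets shape.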
        def last_in_date_group(df, reindex, have_sids=have_sids):
            idx = dates[dates.searchsorted(
                df[TS_FIELD_NAME].values.astype('datetime64[D]')
            )]
            if have_sids:
                idx = [idx, SID_FIELD_NAME]

            last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
                idx,
                sort=False,
            ).last()

            if have_sids:
                last_in_group = last_in_group.unstack()

            if reindex:
                if have_sids:
                    cols = last_in_group.columns
                    last_in_group = last_in_group.reindex(
                        index=dates,
                        columns=pd.MultiIndex.from_product(
                            (cols.levels[0], assets),
                            names=cols.names,
                        ),
                    )
                else:
                    last_in_group = last_in_group.reindex(dates)

            return last_in_group

        sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
        dense_output = last_in_date_group(sparse_output, reindex=True)
        dense_output.ffill(inplace=True)

        # Fill in missing values specified by each column. This is made
        # significantly more complex by the fact that we need to work around
        # two pandas issues:

        # 1) When we have sids, if there are no records for a given sid for any
        #    dates, pandas will generate a column full of NaNs for that sid.
        #    This means that some of the columns in `dense_output` are now
        #    float instead of the intended dtype, so we have to coerce back to
        #    our expected type and convert NaNs into the desired missing value.

        # 2) DataFrame.fillna assumes that receiving None as a fill-value means
        #    that no value was passed.  Consequently, there's no way to tell
        #    pandas to replace NaNs in an object column with None using fillna,
        #    so we have to roll our own instead using df.where.
        for column in columns:
            # Special logic for strings since `fillna` doesn't work if the
            # missing value is `None`.
            if column.dtype == categorical_dtype:
                dense_output[column.name] = dense_output[
                    column.name
                ].where(pd.notnull(dense_output[column.name]),
                        column.missing_value)
            else:
                # We need to execute `fillna` before `astype` in case the
                # column contains NaNs and needs to be cast to bool or int.
                # This is so that the NaNs are replaced first, since pandas
                # can't convert NaNs for those types.
                dense_output[column.name] = dense_output[
                    column.name
                ].fillna(column.missing_value).astype(column.dtype)

        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # We use the column view to make an array per asset.
            column_view = compose(
                # We need to copy this because we need a concrete ndarray.
                # The `repeat_last_axis` call will give us a fancy strided
                # array which uses a buffer to represent `len(assets)` columns.
                # The engine puts nans at the indices for which we do not have
                # sid information so that the nan-aware reductions still work.
                # A future change to the engine would be to add first-class
                # support for macroeconomic datasets.
                copy,
                partial(repeat_last_axis, count=len(assets)),
            )
            adjustments_from_deltas = adjustments_from_deltas_no_sids

        for column_idx, column in enumerate(columns):
            column_name = column.name
            yield column, AdjustedArray(
                column_view(
                    dense_output[column_name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column_name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
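The missing-value handling above needs two different code paths: string-like
columns cannot be filled with None through fillna, and numeric columns must
have their NaNs replaced before casting to bool or int. A small stand-alone
sketch of those two paths, with made-up column names and fill values rather
than the actual pipeline columns:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'rating': ['buy', np.nan, 'sell'],  # object column, missing value is None
    'count':  [1.0, np.nan, 3.0],       # should end up as int64
})

# Strings: fillna(None) is treated as "no fill value given", so use where().
df['rating'] = df['rating'].where(pd.notnull(df['rating']), None)

# Numerics: replace NaN first, then cast, since NaN cannot live in an
# integer column.
df['count'] = df['count'].fillna(0).astype('int64')
print(df.dtypes)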
Example No. 15
0
def load_raw_data(assets,
                  dates,
                  data_query_time,
                  data_query_tz,
                  expr,
                  odo_kwargs,
                  checkpoints=None):
    """
    Given an expression representing data to load, perform normalization and
    forward-filling and return the data, materialized. Only accepts data with a
    `sid` field.

    Parameters
    ----------
    assets : pd.Int64Index
        The assets to load data for.
    dates : pd.DatetimeIndex
        The simulation dates to load data for.
    data_query_time : datetime.time
        The time used as the cutoff for new information.
    data_query_tz : tzinfo
        The timezone to normalize the dates to before comparing against
        `data_query_time`.
    expr : Expr
        The expression representing the data to load.
    odo_kwargs : dict
        Extra keyword arguments to pass to odo when executing the expression.
    checkpoints : Expr, optional
        The expression representing the checkpointed data for `expr`.

    Returns
    -------
    raw : pd.DataFrame
        The result of computing `expr` and materializing the result as a
        DataFrame.
    """
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )
    raw = ffill_query_in_range(
        expr,
        lower_dt,
        upper_dt,
        checkpoints=checkpoints,
        odo_kwargs=odo_kwargs,
    )
    sids = raw[SID_FIELD_NAME]
    raw.drop(
        sids[~sids.isin(assets)].index,
        inplace=True
    )
    if data_query_time is not None:
        normalize_timestamp_to_query_time(
            raw,
            data_query_time,
            data_query_tz,
            inplace=True,
            ts_field=TS_FIELD_NAME,
        )
    return raw
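A conceptual sketch of the query-time cutoff that
normalize_timestamp_to_query_time applies above: records observed before the
daily cutoff count for that session, while later records roll forward to the
next session. The cutoff time and timestamps below are made up, and this is an
illustration of the idea rather than the actual zipline implementation (which
also handles the timezone conversion).

from datetime import time

import pandas as pd

cutoff = time(8, 45)  # hypothetical data_query_time in the query timezone
ts = pd.to_datetime(['2014-01-06 08:30', '2014-01-06 09:15'])

# Keep the session date for early records, push late records to the next day.
normalized = ts.normalize() + pd.to_timedelta(
    [0 if t.time() < cutoff else 1 for t in ts], unit='D',
)
print(normalized)  # 2014-01-06 and 2014-01-07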