Example #1
    def load_adjusted_array(self, domain, columns, dates, sids, mask):
        return merge(
            self.pool.imap_unordered(
                partial(self._load_dataset, dates, sids, mask),
                itervalues(groupby(getitem(self._table_expressions), columns)),
            ),
        )
Example #2
    def _lookup_most_recent_symbols(self, sids):
        symbol_cols = self.equity_symbol_mappings.c
        symbols = {
            row.sid: {c: row[c] for c in symbol_columns}
            for row in concat(
                self.engine.execute(
                    sa.select(
                        (symbol_cols.sid,) +
                        tuple(map(op.getitem(symbol_cols), symbol_columns)),
                    ).where(
                        symbol_cols.sid.in_(map(int, sid_group)),
                    ).order_by(
                        symbol_cols.end_date.desc(),
                    ).group_by(
                        symbol_cols.sid,
                    )
                ).fetchall()
                for sid_group in partition_all(
                    SQLITE_MAX_VARIABLE_NUMBER,
                    sids
                ),
            )
        }

        if len(symbols) != len(sids):
            raise EquitiesNotFound(
                sids=set(sids) - set(symbols),
                plural=True,
            )
        return symbols
Example #3
    def test_read_no_adjustments(self):
        adjustment_reader = NullAdjustmentReader()
        columns = [USEquityPricing.close, USEquityPricing.volume]
        query_days = self.calendar_days_between(TEST_QUERY_START, TEST_QUERY_STOP)
        # Our expected results for each day are based on values from the
        # previous day.
        shifted_query_days = self.calendar_days_between(TEST_QUERY_START, TEST_QUERY_STOP, shift=-1)

        adjustments = adjustment_reader.load_adjustments(columns, query_days, self.assets)
        self.assertEqual(adjustments, [{}, {}])

        baseline_reader = BcolzDailyBarReader(self.bcolz_path)
        pricing_loader = USEquityPricingLoader(baseline_reader, adjustment_reader)

        results = pricing_loader.load_adjusted_array(
            columns, dates=query_days, assets=self.assets, mask=ones((len(query_days), len(self.assets)), dtype=bool)
        )
        closes, volumes = map(getitem(results), columns)

        expected_baseline_closes = self.bcolz_writer.expected_values_2d(shifted_query_days, self.assets, "close")
        expected_baseline_volumes = self.bcolz_writer.expected_values_2d(shifted_query_days, self.assets, "volume")

        # AdjustedArrays should yield the same data as the expected baseline.
        for windowlen in range(1, len(query_days) + 1):
            for offset, window in enumerate(closes.traverse(windowlen)):
                assert_array_equal(expected_baseline_closes[offset : offset + windowlen], window)

            for offset, window in enumerate(volumes.traverse(windowlen)):
                assert_array_equal(expected_baseline_volumes[offset : offset + windowlen], window)

        # Verify that we checked up to the longest possible window.
        with self.assertRaises(WindowLengthTooLong):
            closes.traverse(windowlen + 1)
        with self.assertRaises(WindowLengthTooLong):
            volumes.traverse(windowlen + 1)
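The nested loops above amount to checking that traverse(windowlen) yields rolling windows over the baseline. Below is a minimal, zipline-free sketch of that invariant, using a plain generator as a stand-in for AdjustedArray.traverse with no adjustments applied; the names are illustrative, not zipline's API.

import numpy as np

def traverse(baseline, windowlen):
    # Stand-in for AdjustedArray.traverse with no adjustments: yield every
    # contiguous window of ``windowlen`` rows, oldest first.
    for offset in range(len(baseline) - windowlen + 1):
        yield baseline[offset:offset + windowlen]

baseline = np.arange(12).reshape(6, 2)  # 6 "days" x 2 "assets"
for windowlen in range(1, len(baseline) + 1):
    for offset, window in enumerate(traverse(baseline, windowlen)):
        np.testing.assert_array_equal(
            baseline[offset:offset + windowlen],
            window,
        )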
Example #4
    def _lookup_most_recent_symbols(self, sids):
        symbol_cols = self.equity_symbol_mappings.c

        symbols = {
            row.sid: {c: row[c] for c in symbol_columns}
            for row in self.engine.execute(
                sa.select(
                    (symbol_cols.sid,) +
                    tuple(map(op.getitem(symbol_cols), symbol_columns)),
                ).where(
                    symbol_cols.sid.in_(map(int, sids)),
                ).order_by(
                    symbol_cols.end_date.desc(),
                ).group_by(
                    symbol_cols.sid,
                )
            ).fetchall()
        }

        if len(symbols) != len(sids):
            raise EquitiesNotFound(
                sids=set(sids) - set(symbols),
                plural=True,
            )
        return symbols
Example #5
    def test_read_with_adjustments(self):
        columns = [USEquityPricing.high, USEquityPricing.volume]
        query_days = self.calendar_days_between(TEST_QUERY_START, TEST_QUERY_STOP)
        # Our expected results for each day are based on values from the
        # previous day.
        shifted_query_days = self.calendar_days_between(TEST_QUERY_START, TEST_QUERY_STOP, shift=-1)

        baseline_reader = BcolzDailyBarReader(self.bcolz_path)
        adjustment_reader = SQLiteAdjustmentReader(self.db_path)
        pricing_loader = USEquityPricingLoader(baseline_reader, adjustment_reader)

        results = pricing_loader.load_adjusted_array(
            columns, dates=query_days, assets=Int64Index(arange(1, 7)), mask=ones((len(query_days), 6), dtype=bool)
        )
        highs, volumes = map(getitem(results), columns)

        expected_baseline_highs = self.bcolz_writer.expected_values_2d(shifted_query_days, self.assets, "high")
        expected_baseline_volumes = self.bcolz_writer.expected_values_2d(shifted_query_days, self.assets, "volume")

        # At each point in time, the AdjustedArrays should yield the baseline
        # with all adjustments up to that date applied.
        for windowlen in range(1, len(query_days) + 1):
            for offset, window in enumerate(highs.traverse(windowlen)):
                baseline = expected_baseline_highs[offset : offset + windowlen]
                baseline_dates = query_days[offset : offset + windowlen]
                expected_adjusted_highs = self.apply_adjustments(
                    baseline_dates,
                    self.assets,
                    baseline,
                    # Apply all adjustments.
                    concat([SPLITS, MERGERS, DIVIDENDS_EXPECTED], ignore_index=True),
                )
                assert_allclose(expected_adjusted_highs, window)

            for offset, window in enumerate(volumes.traverse(windowlen)):
                baseline = expected_baseline_volumes[offset : offset + windowlen]
                baseline_dates = query_days[offset : offset + windowlen]
                # Apply only splits and invert the ratio.
                adjustments = SPLITS.copy()
                adjustments.ratio = 1 / adjustments.ratio

                expected_adjusted_volumes = self.apply_adjustments(baseline_dates, self.assets, baseline, adjustments)
                # FIXME: Make AdjustedArray properly support integral types.
                assert_array_equal(expected_adjusted_volumes, window.astype(uint32))

        # Verify that we checked up to the longest possible window.
        with self.assertRaises(WindowLengthTooLong):
            highs.traverse(windowlen + 1)
        with self.assertRaises(WindowLengthTooLong):
            volumes.traverse(windowlen + 1)
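The volume branch above copies SPLITS and inverts the ratio before applying it: a split scales prices by the ratio but share counts by its inverse (a 2-for-1 split halves the price and doubles the volume). A small pandas illustration, with a made-up SPLITS frame standing in for the test fixture:

import pandas as pd

SPLITS = pd.DataFrame({'sid': [1], 'ratio': [0.5]})  # hypothetical 2-for-1 split

price_adjustments = SPLITS.copy()
volume_adjustments = SPLITS.copy()
volume_adjustments.ratio = 1 / volume_adjustments.ratio

print(price_adjustments.ratio.iloc[0])   # 0.5 -> pre-split prices are halved
print(volume_adjustments.ratio.iloc[0])  # 2.0 -> pre-split volumes are doubled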
Example #6
    def load_adjusted_array(self, domain, columns, dates, sids, mask):
        data_query_cutoff_times = domain.data_query_cutoff_for_sessions(
            dates,
        )
        return merge(
            self.pool.imap_unordered(
                partial(
                    self._load_dataset,
                    dates,
                    data_query_cutoff_times,
                    sids,
                    mask,
                ),
                itervalues(groupby(getitem(self._table_expressions), columns)),
            ),
        )
Example #7
    def load_adjusted_array(self, columns, dates, assets, mask):
        return map(
            op.getitem(
                dict(concat(map(
                    partial(
                        self._load_dataset,
                        dates,
                        assets,
                        mask,
                    ),
                    itervalues(groupby(getdataset, columns)),
                ))),
            ),
            columns,
        )
Example #8
    def load_adjusted_array(self, domain, columns, dates, sids, mask):
        data_query_cutoff_times = domain.data_query_cutoff_for_sessions(
            dates,
        )
        return merge(
            self.pool.imap_unordered(
                partial(
                    self._load_dataset,
                    dates,
                    data_query_cutoff_times,
                    sids,
                    mask,
                ),
                itervalues(groupby(getitem(self._table_expressions), columns)),
            ),
        )
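The loader above groups the requested columns by their table expression, loads each group on a thread pool, and merges the per-group dicts into a single {column: array} mapping. Here is a toy, self-contained sketch of that flow with toolz; the table mapping and the dummy load_group below are illustrative stand-ins, not zipline code.

from functools import partial
from multiprocessing.pool import ThreadPool

from toolz import groupby, merge
from toolz.curried.operator import getitem

table_for_column = {'close': 'prices', 'volume': 'prices', 'value': 'events'}

def load_group(dates, sids, mask, columns):
    # Pretend to query one table and return an entry for every column in it.
    return {c: ('array', c, len(dates), len(sids)) for c in columns}

def load_adjusted_array(columns, dates, sids, mask):
    groups = groupby(getitem(table_for_column), columns)  # {table: [columns]}
    pool = ThreadPool(2)
    try:
        return merge(
            pool.imap_unordered(
                partial(load_group, dates, sids, mask),
                groups.values(),
            ),
        )
    finally:
        pool.close()

print(load_adjusted_array(['close', 'volume', 'value'], [1, 2, 3], [10, 11], None))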
Example #9
    def _lookup_most_recent_symbols(self, sids):
        symbol_cols = self.equity_symbol_mappings.c
        symbols = {
            row.sid: {c: row[c] for c in symbol_columns}
            for row in concat(
                self.engine.execute(
                    sa.select((symbol_cols.sid,) + tuple(map(op.getitem(symbol_cols), symbol_columns)))
                    .where(symbol_cols.sid.in_(map(int, sid_group)))
                    .order_by(symbol_cols.end_date.desc())
                    .group_by(symbol_cols.sid)
                ).fetchall()
                for sid_group in partition_all(SQLITE_MAX_VARIABLE_NUMBER, sids)
            )
        }

        if len(symbols) != len(sids):
            raise EquitiesNotFound(sids=set(sids) - set(symbols), plural=True)
        return symbols
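The comprehension above splits sids into chunks so that each IN (...) clause stays under SQLite's bind-parameter limit, then flattens the per-chunk result sets with concat. A minimal sketch of that chunk-and-flatten pattern (the limit value and sids below are illustrative):

from toolz import concat, partition_all

SQLITE_MAX_VARIABLE_NUMBER = 999  # illustrative; the real constant may differ
sids = range(2500)

chunks = list(partition_all(SQLITE_MAX_VARIABLE_NUMBER, sids))
print([len(chunk) for chunk in chunks])  # [999, 999, 502]

# Per-chunk "result sets" flattened back into one iterable of rows:
rows = list(concat([('row for sid', sid) for sid in chunk] for chunk in chunks))
print(len(rows))  # 2500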
Example #10
    def _select_most_recent_symbols_chunk(self, sid_group):
        """Retrieve the most recent symbol for a set of sids.

        Parameters
        ----------
        sid_group : iterable[int]
            The sids to lookup. The length of this sequence must be less than
            or equal to SQLITE_MAX_VARIABLE_NUMBER because the sids will be
            passed in as sql bind params.

        Returns
        -------
        sel : Selectable
            The sqlalchemy selectable that will query for the most recent
            symbol for each sid.

        Notes
        -----
        This is implemented as an inner select of the columns of interest
        ordered by the end date of the (sid, symbol) mapping. We then group
        that inner select on the sid with no aggregations to select the last
        row per group which gives us the most recently active symbol for all
        of the sids.
        """
        symbol_cols = self.equity_symbol_mappings.c
        inner = sa.select(
            (symbol_cols.sid,) +
            tuple(map(
                op.getitem(symbol_cols),
                symbol_columns,
            )),
        ).where(
            symbol_cols.sid.in_(map(int, sid_group)),
        ).order_by(
            symbol_cols.end_date.asc(),
        )
        return sa.select(inner.c).group_by(inner.c.sid)
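The Notes above describe a "last row per group" trick: order an inner select by end_date, then GROUP BY sid with no aggregates so each group collapses to its most recently ended mapping. A standalone sketch with the sqlite3 stdlib module follows; the table and data are made up, and the result relies on SQLite filling bare (non-aggregated) columns from the last row it processes in each group, which is the behaviour the docstring leans on.

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute(
    'CREATE TABLE symbol_mappings (sid INTEGER, symbol TEXT, end_date INTEGER)'
)
conn.executemany(
    'INSERT INTO symbol_mappings VALUES (?, ?, ?)',
    [(1, 'AAPL_OLD', 10), (1, 'AAPL', 20), (2, 'MSFT', 15)],
)

rows = conn.execute(
    """
    SELECT sid, symbol, end_date FROM (
        SELECT sid, symbol, end_date
        FROM symbol_mappings
        WHERE sid IN (1, 2)
        ORDER BY end_date ASC
    )
    GROUP BY sid
    """
).fetchall()
print(rows)  # typically [(1, 'AAPL', 20), (2, 'MSFT', 15)]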
Example #11
    def _select_most_recent_symbols_chunk(self, sid_group):
        """Retrieve the most recent symbol for a set of sids.

        Parameters
        ----------
        sid_group : iterable[int]
            The sids to lookup. The length of this sequence must be less than
            or equal to SQLITE_MAX_VARIABLE_NUMBER because the sids will be
            passed in as sql bind params.

        Returns
        -------
        sel : Selectable
            The sqlalchemy selectable that will query for the most recent
            symbol for each sid.

        Notes
        -----
        This is implemented as an inner select of the columns of interest
        ordered by the end date of the (sid, symbol) mapping. We then group
        that inner select on the sid with no aggregations to select the last
        row per group which gives us the most recently active symbol for all
        of the sids.
        """
        symbol_cols = self.equity_symbol_mappings.c
        inner = sa.select(
            (symbol_cols.sid,) +
            tuple(map(
                op.getitem(symbol_cols),
                symbol_columns,
            )),
        ).where(
            symbol_cols.sid.in_(map(int, sid_group)),
        ).order_by(
            symbol_cols.end_date.asc(),
        )
        return sa.select(inner.c).group_by(inner.c.sid)
Example #12
def _build_preprocessed_function(func,
                                 processors,
                                 args_defaults,
                                 varargs,
                                 varkw):
    """
    Build a preprocessed function with the same signature as `func`.

    Uses `exec` internally to build a function that actually has the same
    signature as ``func``.
    """
    format_kwargs = {'func_name': func.__name__}

    def mangle(name):
        return 'a' + uuid4().hex + name

    format_kwargs['mangled_func'] = mangled_funcname = mangle(func.__name__)

    def make_processor_assignment(arg, processor_name):
        template = "{arg} = {processor}({func}, '{arg}', {arg})"
        return template.format(
            arg=arg,
            processor=processor_name,
            func=mangled_funcname,
        )

    exec_globals = {mangled_funcname: func, 'wraps': wraps}
    defaults_seen = 0
    default_name_template = 'a' + uuid4().hex + '_%d'
    signature = []
    call_args = []
    assignments = []
    star_map = {
        varargs: '*',
        varkw: '**',
    }

    def name_as_arg(arg):
        return star_map.get(arg, '') + arg

    for arg, default in args_defaults:
        if default is NO_DEFAULT:
            signature.append(name_as_arg(arg))
        else:
            default_name = default_name_template % defaults_seen
            exec_globals[default_name] = default
            signature.append('='.join([name_as_arg(arg), default_name]))
            defaults_seen += 1

        if arg in processors:
            procname = mangle('_processor_' + arg)
            exec_globals[procname] = processors[arg]
            assignments.append(make_processor_assignment(arg, procname))

        call_args.append(name_as_arg(arg))

    exec_str = dedent(
        """\
        @wraps({wrapped_funcname})
        def {func_name}({signature}):
            {assignments}
            return {wrapped_funcname}({call_args})
        """
    ).format(
        func_name=func.__name__,
        signature=', '.join(signature),
        assignments='\n    '.join(assignments),
        wrapped_funcname=mangled_funcname,
        call_args=', '.join(call_args),
    )
    compiled = compile(
        exec_str,
        func.__code__.co_filename,
        mode='exec',
    )

    exec_locals = {}
    exec_(compiled, exec_globals, exec_locals)
    new_func = exec_locals[func.__name__]

    code = new_func.__code__
    args = {
        attr: getattr(code, attr)
        for attr in dir(code)
        if attr.startswith('co_')
    }
    # Copy the firstlineno out of the underlying function so that exceptions
    # get raised with the correct traceback.
    # This also makes dynamic source inspection (like IPython `??` operator)
    # work as intended.
    try:
        # Try to get the pycode object from the underlying function.
        original_code = func.__code__
    except AttributeError:
        try:
            # The underlying callable was not a function, try to grab the
            # `__func__.__code__` which exists on method objects.
            original_code = func.__func__.__code__
        except AttributeError:
            # The underlying callable does not have a `__code__`. There is
            # nothing for us to correct.
            return new_func

    args['co_firstlineno'] = original_code.co_firstlineno
    new_func.__code__ = CodeType(*map(getitem(args), _code_argorder))
    return new_func
Example #13
    def test_read_with_adjustments(self):
        columns = [USEquityPricing.high, USEquityPricing.volume]
        query_days = self.calendar_days_between(TEST_QUERY_START,
                                                TEST_QUERY_STOP)
        # Our expected results for each day are based on values from the
        # previous day.
        shifted_query_days = self.calendar_days_between(
            TEST_QUERY_START,
            TEST_QUERY_STOP,
            shift=-1,
        )

        pricing_loader = USEquityPricingLoader(
            self.bcolz_equity_daily_bar_reader,
            self.adjustment_reader,
        )

        results = pricing_loader.load_adjusted_array(
            columns,
            dates=query_days,
            assets=Int64Index(arange(1, 7)),
            mask=ones((len(query_days), 6), dtype=bool),
        )
        highs, volumes = map(getitem(results), columns)

        expected_baseline_highs = expected_bar_values_2d(
            shifted_query_days,
            self.asset_info,
            'high',
        )
        expected_baseline_volumes = expected_bar_values_2d(
            shifted_query_days,
            self.asset_info,
            'volume',
        )

        # At each point in time, the AdjustedArrays should yield the baseline
        # with all adjustments up to that date applied.
        for windowlen in range(1, len(query_days) + 1):
            for offset, window in enumerate(highs.traverse(windowlen)):
                baseline = expected_baseline_highs[offset:offset + windowlen]
                baseline_dates = query_days[offset:offset + windowlen]
                expected_adjusted_highs = self.apply_adjustments(
                    baseline_dates,
                    self.assets,
                    baseline,
                    # Apply all adjustments.
                    concat([SPLITS, MERGERS, DIVIDENDS_EXPECTED],
                           ignore_index=True),
                )
                assert_allclose(expected_adjusted_highs, window)

            for offset, window in enumerate(volumes.traverse(windowlen)):
                baseline = expected_baseline_volumes[offset:offset + windowlen]
                baseline_dates = query_days[offset:offset + windowlen]
                # Apply only splits and invert the ratio.
                adjustments = SPLITS.copy()
                adjustments.ratio = 1 / adjustments.ratio

                expected_adjusted_volumes = self.apply_adjustments(
                    baseline_dates,
                    self.assets,
                    baseline,
                    adjustments,
                )
                # FIXME: Make AdjustedArray properly support integral types.
                assert_array_equal(
                    expected_adjusted_volumes,
                    window.astype(uint32),
                )

        # Verify that we checked up to the longest possible window.
        with self.assertRaises(WindowLengthTooLong):
            highs.traverse(windowlen + 1)
        with self.assertRaises(WindowLengthTooLong):
            volumes.traverse(windowlen + 1)
Example #14
    def test_read_no_adjustments(self):
        adjustment_reader = NullAdjustmentReader()
        columns = [USEquityPricing.close, USEquityPricing.volume]
        query_days = self.calendar_days_between(TEST_QUERY_START,
                                                TEST_QUERY_STOP)
        # Our expected results for each day are based on values from the
        # previous day.
        shifted_query_days = self.calendar_days_between(
            TEST_QUERY_START,
            TEST_QUERY_STOP,
            shift=-1,
        )

        adjustments = adjustment_reader.load_adjustments(
            [c.name for c in columns],
            query_days,
            self.assets,
        )
        self.assertEqual(adjustments, [{}, {}])

        pricing_loader = USEquityPricingLoader(
            self.bcolz_equity_daily_bar_reader,
            adjustment_reader,
        )

        results = pricing_loader.load_adjusted_array(
            columns,
            dates=query_days,
            assets=self.assets,
            mask=ones((len(query_days), len(self.assets)), dtype=bool),
        )
        closes, volumes = map(getitem(results), columns)

        expected_baseline_closes = expected_bar_values_2d(
            shifted_query_days,
            self.asset_info,
            'close',
        )
        expected_baseline_volumes = expected_bar_values_2d(
            shifted_query_days,
            self.asset_info,
            'volume',
        )

        # AdjustedArrays should yield the same data as the expected baseline.
        for windowlen in range(1, len(query_days) + 1):
            for offset, window in enumerate(closes.traverse(windowlen)):
                assert_array_equal(
                    expected_baseline_closes[offset:offset + windowlen],
                    window,
                )

            for offset, window in enumerate(volumes.traverse(windowlen)):
                assert_array_equal(
                    expected_baseline_volumes[offset:offset + windowlen],
                    window,
                )

        # Verify that we checked up to the longest possible window.
        with self.assertRaises(WindowLengthTooLong):
            closes.traverse(windowlen + 1)
        with self.assertRaises(WindowLengthTooLong):
            volumes.traverse(windowlen + 1)
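The closes, volumes = map(getitem(results), columns) line used throughout these tests relies on toolz's curried getitem: applying it to the results dict yields a lookup function, and mapping that over columns returns the arrays in request order. A tiny sketch with plain dicts in place of the real columns and AdjustedArrays:

from toolz.curried.operator import getitem

results = {'close': [1.0, 2.0], 'volume': [100, 200]}
columns = ['close', 'volume']

closes, volumes = map(getitem(results), columns)
print(closes, volumes)  # [1.0, 2.0] [100, 200]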
Example #15
    def compute_chunk(self, graph, dates, assets, initial_workspace):
        """
        Compute the Pipeline terms in the graph for the requested start and end
        dates.

        Parameters
        ----------
        graph : zipline.pipeline.graph.TermGraph
        dates : pd.DatetimeIndex
            Row labels for our root mask.
        assets : pd.Int64Index
            Column labels for our root mask.
        initial_workspace : dict
            Map from term -> output.
            Must contain at least an entry for `self._root_mask_term` whose shape
            is `(len(dates), len(assets))`, but may contain additional
            pre-computed terms for testing or optimization purposes.

        Returns
        -------
        results : dict
            Dictionary mapping requested results to outputs.
        """
        self._validate_compute_chunk_params(dates, assets, initial_workspace)
        get_loader = self.get_loader

        # Copy the supplied initial workspace so we don't mutate it in place.
        workspace = initial_workspace.copy()
        refcounts = graph.initial_refcounts(workspace)
        execution_order = graph.execution_order(refcounts)

        # If loadable terms share the same loader and extra_rows, load them all
        # together.
        loadable_terms = graph.loadable_terms
        loader_group_key = juxt(get_loader, getitem(graph.extra_rows))
        loader_groups = groupby(
            loader_group_key,
            # Only produce loader groups for the terms we expect to load.  This
            # ensures that we can run pipelines for graphs where we don't have
            # a loader registered for an atomic term if all the dependencies of
            # that term were supplied in the initial workspace.
            (t for t in execution_order if t in loadable_terms),
        )

        for term in graph.execution_order(refcounts):
            # `term` may have been supplied in `initial_workspace`, and in the
            # future we may pre-compute loadable terms coming from the same
            # dataset.  In either case, we will already have an entry for this
            # term, which we shouldn't re-compute.
            if term in workspace:
                continue

            # Asset labels are always the same, but date labels vary by how
            # many extra rows are needed.
            mask, mask_dates = graph.mask_and_dates_for_term(
                term,
                self._root_mask_term,
                workspace,
                dates,
            )

            if isinstance(term, LoadableTerm):
                to_load = sorted(
                    loader_groups[loader_group_key(term)],
                    key=lambda t: t.dataset
                )
                loader = get_loader(term)
                loaded = loader.load_adjusted_array(
                    to_load, mask_dates, assets, mask,
                )
                assert set(loaded) == set(to_load), (
                    'loader did not return an AdjustedArray for each column\n'
                    'expected: %r\n'
                    'got:      %r' % (sorted(to_load), sorted(loaded))
                )
                workspace.update(loaded)
            else:
                workspace[term] = term._compute(
                    self._inputs_for_term(term, workspace, graph),
                    mask_dates,
                    assets,
                    mask,
                )
                if term.ndim == 2:
                    assert workspace[term].shape == mask.shape
                else:
                    assert workspace[term].shape == (mask.shape[0], 1)

            # Decref dependencies of ``term``, and clear any terms whose
            # refcounts hit 0.
            for garbage_term in graph.decref_dependencies(term, refcounts):
                del workspace[garbage_term]

        out = {}
        graph_extra_rows = graph.extra_rows
        for name, term in iteritems(graph.outputs):
            # Truncate off extra rows from outputs.
            out[name] = workspace[term][graph_extra_rows[term]:]
        return out
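The loader grouping above keys each loadable term by the tuple (loader, extra_rows) via juxt, so terms that share both can be loaded in one call. A small sketch with made-up terms, loaders, and extra-row counts:

from toolz import groupby, juxt
from toolz.curried.operator import getitem

loaders = {'close': 'pricing_loader', 'volume': 'pricing_loader', 'sentiment': 'blaze_loader'}
extra_rows = {'close': 5, 'volume': 5, 'sentiment': 0}

loader_group_key = juxt(getitem(loaders), getitem(extra_rows))
print(groupby(loader_group_key, ['close', 'volume', 'sentiment']))
# {('pricing_loader', 5): ['close', 'volume'], ('blaze_loader', 0): ['sentiment']}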
Example #16
def _build_preprocessed_function(func, processors, args_defaults, varargs,
                                 varkw):
    """
    Build a preprocessed function with the same signature as `func`.

    Uses `exec` internally to build a function that actually has the same
    signature as ``func``.
    """
    format_kwargs = {'func_name': func.__name__}

    def mangle(name):
        return 'a' + uuid4().hex + name

    format_kwargs['mangled_func'] = mangled_funcname = mangle(func.__name__)

    def make_processor_assignment(arg, processor_name):
        template = "{arg} = {processor}({func}, '{arg}', {arg})"
        return template.format(
            arg=arg,
            processor=processor_name,
            func=mangled_funcname,
        )

    exec_globals = {mangled_funcname: func, 'wraps': wraps}
    defaults_seen = 0
    default_name_template = 'a' + uuid4().hex + '_%d'
    signature = []
    call_args = []
    assignments = []
    star_map = {
        varargs: '*',
        varkw: '**',
    }

    def name_as_arg(arg):
        return star_map.get(arg, '') + arg

    for arg, default in args_defaults:
        if default is NO_DEFAULT:
            signature.append(name_as_arg(arg))
        else:
            default_name = default_name_template % defaults_seen
            exec_globals[default_name] = default
            signature.append('='.join([name_as_arg(arg), default_name]))
            defaults_seen += 1

        if arg in processors:
            procname = mangle('_processor_' + arg)
            exec_globals[procname] = processors[arg]
            assignments.append(make_processor_assignment(arg, procname))

        call_args.append(name_as_arg(arg))

    exec_str = dedent("""\
        @wraps({wrapped_funcname})
        def {func_name}({signature}):
            {assignments}
            return {wrapped_funcname}({call_args})
        """).format(
        func_name=func.__name__,
        signature=', '.join(signature),
        assignments='\n    '.join(assignments),
        wrapped_funcname=mangled_funcname,
        call_args=', '.join(call_args),
    )
    compiled = compile(
        exec_str,
        func.__code__.co_filename,
        mode='exec',
    )

    exec_locals = {}
    exec(compiled, exec_globals, exec_locals)
    new_func = exec_locals[func.__name__]

    code = new_func.__code__
    args = {
        attr: getattr(code, attr)
        for attr in dir(code) if attr.startswith('co_')
    }
    # Copy the firstlineno out of the underlying function so that exceptions
    # get raised with the correct traceback.
    # This also makes dynamic source inspection (like IPython `??` operator)
    # work as intended.
    try:
        # Try to get the pycode object from the underlying function.
        original_code = func.__code__
    except AttributeError:
        try:
            # The underlying callable was not a function, try to grab the
            # `__func__.__code__` which exists on method objects.
            original_code = func.__func__.__code__
        except AttributeError:
            # The underlying callable does not have a `__code__`. There is
            # nothing for us to correct.
            return new_func

    args['co_firstlineno'] = original_code.co_firstlineno
    new_func.__code__ = CodeType(*map(getitem(args), _code_argorder))
    return new_func
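The function above generates source text so that the wrapper it builds has the same named parameters as the wrapped function, rather than a generic *args/**kwargs signature. The stripped-down sketch below shows the same exec-and-compile technique; the upper-casing "processor" and all names are invented for illustration, and it ignores defaults, *args, and **kwargs, which the real implementation handles.

from functools import wraps
from inspect import signature
from textwrap import dedent

def preprocess_name(func):
    # Rebuild ``func``'s parameter list as source text (no defaults handled here).
    params = ', '.join(signature(func).parameters)
    src = dedent("""\
        @wraps(_wrapped)
        def {name}({params}):
            name = name.upper()          # stand-in for a processor assignment
            return _wrapped({params})
        """).format(name=func.__name__, params=params)

    namespace = {'_wrapped': func, 'wraps': wraps}
    exec(compile(src, '<generated>', 'exec'), namespace)
    return namespace[func.__name__]

@preprocess_name
def greet(greeting, name):
    return greeting + ', ' + name

print(greet('hello', 'world'))       # hello, WORLD
print(greet.__code__.co_varnames)    # ('greeting', 'name') -- real named params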
Example #17
    def compute_chunk(self, graph, dates, assets, initial_workspace):
        """
        Compute the Pipeline terms in the graph for the requested start and end
        dates.

        Parameters
        ----------
        graph : zipline.pipeline.graph.TermGraph
        dates : pd.DatetimeIndex
            Row labels for our root mask.
        assets : pd.Int64Index
            Column labels for our root mask.
        initial_workspace : dict
            Map from term -> output.
            Must contain at least an entry for `self._root_mask_term` whose shape
            is `(len(dates), len(assets))`, but may contain additional
            pre-computed terms for testing or optimization purposes.

        Returns
        -------
        results : dict
            Dictionary mapping requested results to outputs.
        """
        self._validate_compute_chunk_params(dates, assets, initial_workspace)
        get_loader = self.get_loader

        # Copy the supplied initial workspace so we don't mutate it in place.
        workspace = initial_workspace.copy()

        # If loadable terms share the same loader and extra_rows, load them all
        # together.
        loader_group_key = juxt(get_loader, getitem(graph.extra_rows))
        loader_groups = groupby(loader_group_key, graph.loadable_terms)

        for term in graph.ordered():
            # `term` may have been supplied in `initial_workspace`, and in the
            # future we may pre-compute loadable terms coming from the same
            # dataset.  In either case, we will already have an entry for this
            # term, which we shouldn't re-compute.
            if term in workspace:
                continue

            # Asset labels are always the same, but date labels vary by how
            # many extra rows are needed.
            mask, mask_dates = self._mask_and_dates_for_term(
                term, workspace, graph, dates)

            if isinstance(term, LoadableTerm):
                to_load = sorted(loader_groups[loader_group_key(term)],
                                 key=lambda t: t.dataset)
                loader = get_loader(term)
                loaded = loader.load_adjusted_array(
                    to_load,
                    mask_dates,
                    assets,
                    mask,
                )
                workspace.update(loaded)
            else:
                workspace[term] = term._compute(
                    self._inputs_for_term(term, workspace, graph),
                    mask_dates,
                    assets,
                    mask,
                )
                assert (workspace[term].shape == mask.shape)

        out = {}
        graph_extra_rows = graph.extra_rows
        for name, term in iteritems(graph.outputs):
            # Truncate off extra rows from outputs.
            out[name] = workspace[term][graph_extra_rows[term]:]
        return out
Example #18
    def compute_chunk(self, graph, dates, assets, initial_workspace):
        """
        Compute the Pipeline terms in the graph for the requested start and end
        dates.

        Parameters
        ----------
        graph : zipline.pipeline.graph.TermGraph
        dates : pd.DatetimeIndex
            Row labels for our root mask.
        assets : pd.Int64Index
            Column labels for our root mask.
        initial_workspace : dict
            Map from term -> output.
            Must contain at least an entry for `self._root_mask_term` whose shape
            is `(len(dates), len(assets))`, but may contain additional
            pre-computed terms for testing or optimization purposes.

        Returns
        -------
        results : dict
            Dictionary mapping requested results to outputs.
        """
        self._validate_compute_chunk_params(dates, assets, initial_workspace)
        get_loader = self.get_loader

        # Copy the supplied initial workspace so we don't mutate it in place.
        workspace = initial_workspace.copy()

        # If loadable terms share the same loader and extra_rows, load them all
        # together.
        loader_group_key = juxt(get_loader, getitem(graph.extra_rows))
        loader_groups = groupby(loader_group_key, graph.loadable_terms)

        for term in graph.ordered():
            # `term` may have been supplied in `initial_workspace`, and in the
            # future we may pre-compute loadable terms coming from the same
            # dataset.  In either case, we will already have an entry for this
            # term, which we shouldn't re-compute.
            if term in workspace:
                continue

            # Asset labels are always the same, but date labels vary by how
            # many extra rows are needed.
            mask, mask_dates = self._mask_and_dates_for_term(
                term, workspace, graph, dates
            )

            if isinstance(term, LoadableTerm):
                to_load = sorted(
                    loader_groups[loader_group_key(term)],
                    key=lambda t: t.dataset
                )
                loader = get_loader(term)
                loaded = loader.load_adjusted_array(
                    to_load, mask_dates, assets, mask,
                )
                workspace.update(loaded)
            else:
                workspace[term] = term._compute(
                    self._inputs_for_term(term, workspace, graph),
                    mask_dates,
                    assets,
                    mask,
                )
                assert(workspace[term].shape == mask.shape)

        out = {}
        graph_extra_rows = graph.extra_rows
        for name, term in iteritems(graph.outputs):
            # Truncate off extra rows from outputs.
            out[name] = workspace[term][graph_extra_rows[term]:]
        return out
Example #19
    def test_read_no_adjustments(self):
        adjustment_reader = NullAdjustmentReader()
        columns = [USEquityPricing.close, USEquityPricing.volume]
        query_days = self.calendar_days_between(
            TEST_QUERY_START,
            TEST_QUERY_STOP
        )
        # Our expected results for each day are based on values from the
        # previous day.
        shifted_query_days = self.calendar_days_between(
            TEST_QUERY_START,
            TEST_QUERY_STOP,
            shift=-1,
        )

        adjustments = adjustment_reader.load_pricing_adjustments(
            [c.name for c in columns],
            query_days,
            self.sids,
        )
        self.assertEqual(adjustments, [{}, {}])

        pricing_loader = USEquityPricingLoader(
            self.bcolz_equity_daily_bar_reader,
            adjustment_reader,
        )

        results = pricing_loader.load_adjusted_array(
            domain=US_EQUITIES,
            columns=columns,
            dates=query_days,
            sids=self.sids,
            mask=ones((len(query_days), len(self.sids)), dtype=bool),
        )
        closes, volumes = map(getitem(results), columns)

        expected_baseline_closes = expected_bar_values_2d(
            shifted_query_days,
            self.sids,
            self.asset_info,
            'close',
        )
        expected_baseline_volumes = expected_bar_values_2d(
            shifted_query_days,
            self.sids,
            self.asset_info,
            'volume',
        )

        # AdjustedArrays should yield the same data as the expected baseline.
        for windowlen in range(1, len(query_days) + 1):
            for offset, window in enumerate(closes.traverse(windowlen)):
                assert_array_equal(
                    expected_baseline_closes[offset:offset + windowlen],
                    window,
                )

            for offset, window in enumerate(volumes.traverse(windowlen)):
                assert_array_equal(
                    expected_baseline_volumes[offset:offset + windowlen],
                    window,
                )

        # Verify that we checked up to the longest possible window.
        with self.assertRaises(WindowLengthTooLong):
            closes.traverse(windowlen + 1)
        with self.assertRaises(WindowLengthTooLong):
            volumes.traverse(windowlen + 1)