def test_produce_nans_when_too_much_missing_data(self, nan_offset):
        rand = np.random.RandomState(42)

        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independent = as_column(np.linspace(-5., 5., 30))
        noise = as_column(rand.uniform(-.1, .1, 30))
        dependents = 1.0 + true_betas * independent + noise

        # Write nans in a triangular pattern into the middle of the dependent
        # array.
        nan_grid = np.array([[1, 0, 0, 0, 0],
                             [1, 1, 0, 0, 0],
                             [1, 1, 1, 0, 0],
                             [1, 1, 1, 1, 0],
                             [1, 1, 1, 1, 1]], dtype=bool)
        num_nans = nan_grid.sum(axis=0)
        # Move the grid around in the parameterized tests. The positions
        # shouldn't matter.
        dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan

        for allowed_missing in range(7):
            results = vectorized_beta(dependents, independent, allowed_missing)
            for i, expected in enumerate(true_betas):
                result = results[i]
                expect_nan = num_nans[i] > allowed_missing
                true_beta = true_betas[i]
                if expect_nan:
                    self.assertTrue(np.isnan(result))
                else:
                    self.assertTrue(np.abs(result - true_beta) < 0.01)
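For context, a minimal NumPy sketch (not the library's vectorized_beta itself) of the contract these assertions exercise: regress each dependent column against the single independent column, and emit NaN once a column's joint missing count exceeds allowed_missing. The helper name below is hypothetical.

import numpy as np

def naive_columnwise_beta(dependents, independent, allowed_missing):
    # dependents: (T, N); independent: (T, 1). Hypothetical reference helper.
    out = np.full(dependents.shape[1], np.nan)
    x_full = independent[:, 0]
    for j in range(dependents.shape[1]):
        y = dependents[:, j]
        missing = np.isnan(y) | np.isnan(x_full)
        if missing.sum() > allowed_missing:
            continue  # too much missing data: leave NaN in the output
        y, x = y[~missing], x_full[~missing]
        # OLS slope of y on x: cov(y, x) / var(x), with matching ddof.
        out[j] = np.cov(y, x, ddof=1)[0, 1] / np.var(x, ddof=1)
    return out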
    def test_produce_nans_when_too_much_missing_data(self, seed, nans,
                                                     nan_offset):
        rand = np.random.RandomState(seed)

        betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independents = as_column(np.linspace(-5., 5., 30)) + np.arange(5)
        noise = as_column(rand.uniform(-2, 2, 30))
        dependents = 1.0 + betas * independents + noise

        # Write nans in a triangular pattern into the middle of the dependent
        # array.
        nan_grid = np.array([[1, 1, 1, 1, 1],
                             [0, 1, 1, 1, 1],
                             [0, 0, 1, 1, 1],
                             [0, 0, 0, 1, 1],
                             [0, 0, 0, 0, 1]], dtype=bool)

        if nans == 'dependent' or nans == 'both':
            dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan
        if nans == 'independent' or nans == 'both':
            independents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan

        expected = self.naive_columnwise_pearson(dependents, independents)
        for allowed_missing in list(range(7)) + [10000]:
            results = vectorized_pearson_r(dependents, independents,
                                           allowed_missing)
            for i, result in enumerate(results):
                # column i has i + 1 missing values.
                if i + 1 > allowed_missing:
                    self.assertTrue(np.isnan(result))
                else:
                    assert_equal(result, expected[i])
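The naive_columnwise_pearson oracle used above isn't shown in this excerpt; a plausible sketch of such a helper (pairwise NaN removal, one Pearson r per column) might look like the following. The function name and broadcasting behavior are assumptions.

import numpy as np

def columnwise_pearson_sketch(dependents, independents):
    # Hypothetical stand-in for naive_columnwise_pearson: both inputs (T, N),
    # with a (T, 1) independent broadcast across the dependent columns.
    independents = np.broadcast_to(independents, dependents.shape)
    out = np.empty(dependents.shape[1])
    for j in range(dependents.shape[1]):
        y, x = dependents[:, j], independents[:, j]
        keep = ~(np.isnan(y) | np.isnan(x))
        out[j] = np.corrcoef(y[keep], x[keep])[0, 1]
    return out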
    def test_matches_empyrical_beta_aligned(self, seed):
        rand = np.random.RandomState(seed)

        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independent = as_column(np.linspace(-5., 5., 30))
        noise = as_column(rand.uniform(-.1, .1, 30))
        dependents = 1.0 + true_betas * independent + noise

        result = self.compare_with_empyrical(dependents, independent)
        self.assertTrue((np.abs(result - true_betas) < 0.01).all())
def make_trade_data_for_asset_info(dates,
                                   asset_info,
                                   price_start,
                                   price_step_by_date,
                                   price_step_by_sid,
                                   volume_start,
                                   volume_step_by_date,
                                   volume_step_by_sid,
                                   frequency,
                                   writer=None):
    """
    Convert the asset info dataframe into a dataframe of trade data for each
    sid, and write to the writer if provided. Write zeros for locations where
    assets did not exist. Return a dict of the dataframes, keyed by sid.
    """
    trade_data = {}
    sids = asset_info.index

    price_sid_deltas = np.arange(len(sids), dtype=float64) * price_step_by_sid
    price_date_deltas = (np.arange(len(dates), dtype=float64) *
                         price_step_by_date)
    prices = (price_sid_deltas + as_column(price_date_deltas)) + price_start

    volume_sid_deltas = np.arange(len(sids)) * volume_step_by_sid
    volume_date_deltas = np.arange(len(dates)) * volume_step_by_date
    volumes = volume_sid_deltas + as_column(volume_date_deltas) + volume_start

    for j, sid in enumerate(sids):
        start_date, end_date = asset_info.loc[sid, ['start_date', 'end_date']]
        # Normalize here so that we still generate non-NaN values on the minutes
        # for an asset's last trading day.
        for i, date in enumerate(dates.normalize()):
            if not (start_date <= date <= end_date):
                prices[i, j] = 0
                volumes[i, j] = 0

        df = pd.DataFrame(
            {
                "open": prices[:, j],
                "high": prices[:, j],
                "low": prices[:, j],
                "close": prices[:, j],
                "volume": volumes[:, j],
            },
            index=dates,
        )

        if writer:
            writer.write_sid(sid, df)

        trade_data[sid] = df

    return trade_data
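A hypothetical usage sketch, assuming the function above and pandas are in scope. The asset_info frame is indexed by sid with start_date/end_date columns, as the loop over asset_info.index implies; the numeric values are made up.

import pandas as pd

dates = pd.date_range('2016-01-04', periods=10, freq='B')
asset_info = pd.DataFrame(
    {'start_date': [dates[0], dates[3]],
     'end_date': [dates[-1], dates[6]]},
    index=[1, 2],  # sids
)

frames = make_trade_data_for_asset_info(
    dates, asset_info,
    price_start=10.0, price_step_by_date=1.0, price_step_by_sid=100.0,
    volume_start=1000, volume_step_by_date=10, volume_step_by_sid=100,
    frequency='daily',  # not used by the pricing math above
    writer=None,        # skip writing; just return the dict of DataFrames
)
# sid 2 does not exist yet on the first session, so its row is all zeros.
print(frames[2].loc[dates[0]])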
Example #8
    def test_allowed_missing_doesnt_double_count(self):
        # Test that allowed_missing only counts a row as missing one
        # observation if it's missing in both the dependent and independent
        # variable.
        rand = np.random.RandomState(42)
        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
        independent = as_column(np.linspace(-5.0, 5.0, 30))
        noise = as_column(rand.uniform(-0.1, 0.1, 30))
        dependents = 1.0 + true_betas * independent + noise

        # Each column has three nans in the grid.
        dependent_nan_grid = np.array(
            [
                [0, 1, 1, 1, 0],
                [0, 0, 1, 1, 1],
                [1, 0, 0, 1, 1],
                [1, 1, 0, 0, 1],
                [1, 1, 1, 0, 0],
            ],
            dtype=bool,
        )
        # There are also two nans in the independent data.
        independent_nan_grid = np.array([[0], [0], [1], [1], [0]], dtype=bool)

        dependents[10:15][dependent_nan_grid] = np.nan
        independent[10:15][independent_nan_grid] = np.nan

        # With only two allowed missing values, everything should come up nan,
        # because each column has at least 3 nans in the dependent data.
        result2 = vectorized_beta(dependents, independent, allowed_missing=2)
        assert_equal(np.isnan(result2),
                     np.array([True, True, True, True, True]))

        # With three allowed missing values, the first and last columns should
        # produce a value, because they have nans at the same rows where the
        # independent data has nans.
        result3 = vectorized_beta(dependents, independent, allowed_missing=3)
        assert_equal(np.isnan(result3),
                     np.array([False, True, True, True, False]))

        # With four allowed missing values, everything but the middle column
        # should produce a value. The middle column will have 5 nans because
        # the dependent nans have no overlap with the independent nans.
        result4 = vectorized_beta(dependents, independent, allowed_missing=4)
        assert_equal(np.isnan(result4),
                     np.array([False, False, True, False, False]))

        # With five allowed missing values, everything should produce a value.
        result5 = vectorized_beta(dependents, independent, allowed_missing=5)
        assert_equal(np.isnan(result5),
                     np.array([False, False, False, False, False]))
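The counting rule behind these expectations: a row is missing for column j if either series is NaN there, so the per-column count is the size of the union of NaN rows, not the sum. A small sketch of that bookkeeping (the helper name is ours, not the library's):

import numpy as np

def missing_per_column(dependents, independent):
    # A row counts once per column if either input is NaN there; the (T, 1)
    # independent broadcasts across all dependent columns.
    joint_missing = np.isnan(dependents) | np.isnan(independent)
    return joint_missing.sum(axis=0)

# For the grids above this yields [3, 4, 5, 4, 3], which is why allowed_missing
# values of 2, 3, 4, and 5 unmask the columns in the order asserted.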
Example #10
    def load_adjusted_array(self, columns, dates, assets, mask):
        """
        Load data from our stored baseline.
        """
        column = self.column
        if len(columns) != 1:
            raise ValueError(
                "Can't load multiple columns with DataFrameLoader")
        elif columns[0] != column:
            raise ValueError("Can't load unknown column %s" % columns[0])

        date_indexer = self.dates.get_indexer(dates)
        assets_indexer = self.assets.get_indexer(assets)

        # Boolean arrays with True on matched entries
        good_dates = (date_indexer != -1)
        good_assets = (assets_indexer != -1)

        return {
            column:
            AdjustedArray(
                # Pull out requested columns/rows from our baseline data.
                data=self.baseline[ix_(date_indexer, assets_indexer)],
                # Mask out requested columns/rows that didn't match.
                mask=(good_assets & as_column(good_dates)) & mask,
                adjustments=self.format_adjustments(dates, assets),
                missing_value=column.missing_value,
            ),
        }
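The mask expression is where as_column matters: good_assets holds one flag per requested sid and good_dates one per requested date, and turning the latter into a column lets & broadcast them into a full dates-by-sids grid. A toy illustration of just the shapes (not the loader's real data, and assuming as_column behaves like adding a trailing axis):

import numpy as np

good_dates = np.array([True, False, True])           # 3 requested dates
good_assets = np.array([True, True, False, True])    # 4 requested sids

grid = good_assets & good_dates[:, np.newaxis]        # shape (3, 4)
# A cell is True only when both its date and its sid were found in the baseline.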
Example #11
    def load_adjusted_array(self, domain, columns, dates, sids, mask):
        """
        Load data from our stored baseline.
        """
        if len(columns) != 1:
            raise ValueError(
                "Can't load multiple columns with DataFrameLoader")

        column = columns[0]
        self._validate_input_column(column)

        date_indexer = self.dates.get_indexer(dates)
        assets_indexer = self.assets.get_indexer(sids)

        # Boolean arrays with True on matched entries
        good_dates = (date_indexer != -1)
        good_assets = (assets_indexer != -1)

        data = self.baseline[ix_(date_indexer, assets_indexer)]
        mask = (good_assets & as_column(good_dates)) & mask

        # Mask out requested columns/rows that didn't match.
        data[~mask] = column.missing_value

        return {
            column:
            AdjustedArray(
                # Pull out requested columns/rows from our baseline data.
                data=data,
                adjustments=self.format_adjustments(dates, sids),
                missing_value=column.missing_value,
            ),
        }
Example #12
    def _run_pipeline_impl(self, pipeline, start_date, end_date, hooks):
        """Shared core for ``run_pipeline`` and ``run_chunked_pipeline``.
        """
        # See notes at the top of this module for a description of the
        # algorithm implemented here.
        if end_date < start_date:
            raise ValueError(
                "start_date must be before or equal to end_date \n"
                "start_date=%s, end_date=%s" % (start_date, end_date))

        domain = self.resolve_domain(pipeline)

        plan = pipeline.to_execution_plan(
            domain,
            self._root_mask_term,
            start_date,
            end_date,
        )
        extra_rows = plan.extra_rows[self._root_mask_term]
        root_mask = self._compute_root_mask(
            domain,
            start_date,
            end_date,
            extra_rows,
        )
        dates, sids, root_mask_values = explode(root_mask)

        workspace = self._populate_initial_workspace(
            {
                self._root_mask_term: root_mask_values,
                self._root_mask_dates_term: as_column(dates.values)
            },
            self._root_mask_term,
            plan,
            dates,
            sids,
        )

        refcounts = plan.initial_refcounts(workspace)
        execution_order = plan.execution_order(workspace, refcounts)

        with hooks.computing_chunk(execution_order, start_date, end_date):

            results = self.compute_chunk(
                graph=plan,
                dates=dates,
                sids=sids,
                workspace=workspace,
                refcounts=refcounts,
                execution_order=execution_order,
                hooks=hooks,
            )

        return self._to_narrow(
            plan.outputs,
            results,
            results.pop(plan.screen_name),
            dates[extra_rows:],
            sids,
        )
Example #15
    def load_adjusted_array(self, domain, columns, dates, sids, mask):
        date_indexer = np.searchsorted(self.dates, dates)
        assets_indexer = np.searchsorted(self.assets, sids)

        # Boolean arrays with True on matched entries
        good_dates = (date_indexer != -1)
        good_assets = (assets_indexer != -1)
        mask = (good_assets & as_column(good_dates)) & mask
        out = {}
        with pd.HDFStore(self.data_path) as store:
            for column in columns:
                try:
                    data = store["/data/" + column.name].values
                    data = data[np.ix_(date_indexer, assets_indexer)]
                    data[~mask] = column.missing_value
                except KeyError:
                    raise ValueError("Couldn't find loader for %s" %
                                     column.name)
                out[column] = AdjustedArray(
                    # Pull out requested columns/rows from our baseline data.
                    data=data,
                    adjustments={},
                    missing_value=column.missing_value,
                )
        return out
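This loader reads one frame per pipeline column from the key '/data/<column name>'. A hypothetical sketch of producing such a file (the column name and file path are made up, and pandas' HDF support requires PyTables):

import numpy as np
import pandas as pd

dates = pd.date_range('2016-01-04', periods=5, freq='B')
sids = [1, 2, 3]
baseline = pd.DataFrame(np.random.randn(len(dates), len(sids)),
                        index=dates, columns=sids)

with pd.HDFStore('pipeline_data.h5') as store:
    # One frame per pipeline column, keyed the way the loader reads it back.
    store.put('/data/close', baseline)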
Example #16
    def lifetimes(self, dates, include_start_date, country_codes):
        """
        Compute a DataFrame representing asset lifetimes for the specified date
        range.

        Parameters
        ----------
        dates : pd.DatetimeIndex
            The dates for which to compute lifetimes.
        include_start_date : bool
            Whether or not to count the asset as alive on its start_date.

            This is useful in a backtesting context where `lifetimes` is being
            used to signify "do I have data for this asset as of the morning of
            this date?"  For many financial metrics, (e.g. daily close), data
            isn't available for an asset until the end of the asset's first
            day.
        country_codes : iterable[str]
            The country codes to get lifetimes for.

        Returns
        -------
        lifetimes : pd.DataFrame
            A frame of dtype bool with `dates` as index and an Int64Index of
            assets as columns.  The value at `lifetimes.loc[date, asset]` will
            be True iff `asset` existed on `date`.  If `include_start_date` is
            False, then lifetimes.loc[date, asset] will be false when date ==
            asset.start_date.

        See Also
        --------
        numpy.putmask
        zipline.pipeline.engine.SimplePipelineEngine._compute_root_mask
        """
        if isinstance(country_codes, string_types):
            raise TypeError(
                "Got string {!r} instead of an iterable of strings in "
                "AssetFinder.lifetimes.".format(country_codes), )

        # normalize to a cache-key so that we can memoize results.
        country_codes = frozenset(country_codes)

        lifetimes = self._asset_lifetimes.get(country_codes)
        if lifetimes is None:
            self._asset_lifetimes[country_codes] = lifetimes = (
                self._compute_asset_lifetimes(country_codes))

        raw_dates = as_column(dates.asi8)
        if include_start_date:
            mask = lifetimes.start <= raw_dates
        else:
            mask = lifetimes.start < raw_dates
        mask &= (raw_dates <= lifetimes.end)

        return pd.DataFrame(mask, index=dates, columns=lifetimes.sid)
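The closing comparison is the same as_column broadcasting trick: per-asset start/end bounds (as integers) compared against a column of query dates. A toy version with plain integers, assuming as_column adds a trailing axis:

import numpy as np

starts = np.array([10, 20, 30])   # one start per asset (e.g. epoch ns)
ends = np.array([25, 40, 35])     # one end per asset
query = np.array([15, 30])        # the requested dates, as integers

raw_dates = query[:, np.newaxis]  # what as_column(dates.asi8) produces
alive = (starts <= raw_dates) & (raw_dates <= ends)
# alive has shape (n_dates, n_assets):
# array([[ True, False, False],
#        [False,  True,  True]])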
    def test_nan_handling_matches_empyrical(self, seed, pct_dependent,
                                            pct_independent):
        rand = np.random.RandomState(seed)

        true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5]) * 10
        independent = as_column(np.linspace(-5., 10., 50))
        noise = as_column(rand.uniform(-.1, .1, 50))
        dependents = 1.0 + true_betas * independent + noise

        # Insert nans into the inputs: randomly into the dependents, and
        # deterministically into the independent wherever it exceeds its mean.
        dependents[rand.uniform(0, 1, dependents.shape) < pct_dependent] = nan
        independent[independent > np.nanmean(independent)] = nan

        # Sanity check that we actually inserted some nans.
        # self.assertTrue(np.count_nonzero(np.isnan(dependents)) > 0)
        self.assertTrue(np.count_nonzero(np.isnan(independent)) > 0)

        result = self.compare_with_empyrical(dependents, independent)

        # compare_with_empyrical uses required_observations=0, so we shouldn't
        # have any nans in the output even though we had some in the input.
        self.assertTrue(not np.isnan(result).any())
    def test_broadcasting(self):
        _independent = as_column(np.array([1, 2, 3, 4, 5]))
        dependent = _independent * [2.5, 1.0, -3.5]

        def do_check(independent):
            result = vectorized_pearson_r(dependent,
                                          independent,
                                          allowed_missing=0)
            assert_equal(result, np.array([1.0, 1.0, -1.0]))

        # We should get the same result from passing a N x 1 array or an N x 3
        # array with the column tiled 3 times.
        do_check(_independent)
        do_check(np.tile(_independent, 3))
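The equivalence being checked follows directly from NumPy broadcasting: an (N, 1) column times a length-3 row yields an (N, 3) matrix, so each dependent column is an exact multiple of the independent one and the correlations are exactly +/-1. A quick shape check, assuming as_column simply adds a trailing axis:

import numpy as np

independent = np.array([1, 2, 3, 4, 5])[:, np.newaxis]  # shape (5, 1)
dependent = independent * [2.5, 1.0, -3.5]              # broadcasts to (5, 3)
tiled = np.tile(independent, 3)                         # (5, 3), same column repeated

print(dependent.shape, tiled.shape)  # (5, 3) (5, 3)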
Example #20
    def lifetimes(self, dates, include_start_date):
        """
        Compute a DataFrame representing asset lifetimes for the specified date
        range.

        Parameters
        ----------
        dates : pd.DatetimeIndex
            The dates for which to compute lifetimes.
        include_start_date : bool
            Whether or not to count the asset as alive on its start_date.

            This is useful in a backtesting context where `lifetimes` is being
            used to signify "do I have data for this asset as of the morning of
            this date?"  For many financial metrics, (e.g. daily close), data
            isn't available for an asset until the end of the asset's first
            day.

        Returns
        -------
        lifetimes : pd.DataFrame
            A frame of dtype bool with `dates` as index and an Int64Index of
            assets as columns.  The value at `lifetimes.loc[date, asset]` will
            be True iff `asset` existed on `date`.  If `include_start_date` is
            False, then lifetimes.loc[date, asset] will be false when date ==
            asset.start_date.

        See Also
        --------
        numpy.putmask
        zipline.pipeline.engine.SimplePipelineEngine._compute_root_mask
        """
        # This is a less than ideal place to do this, because if someone adds
        # assets to the finder after we've touched lifetimes we won't have
        # those new assets available.  Mutability is not my favorite
        # programming feature.
        if self._asset_lifetimes is None:
            self._asset_lifetimes = self._compute_asset_lifetimes()
        lifetimes = self._asset_lifetimes

        raw_dates = as_column(dates.asi8)
        if include_start_date:
            mask = lifetimes.start <= raw_dates
        else:
            mask = lifetimes.start < raw_dates
        mask &= (raw_dates <= lifetimes.end)

        return pd.DataFrame(mask, index=dates, columns=lifetimes.sid)
Example #22
    def run_pipeline(self, pipeline):
        now = pd.Timestamp.now(tz=self._calendar.tz)
        today = pd.Timestamp(year=now.year,
                             month=now.month,
                             day=now.day,
                             tz='utc')

        end_date = self._calendar[self._calendar.get_loc(today,
                                                         method='ffill')]
        start_date = end_date
        screen_name = uuid4().hex
        graph = pipeline.to_execution_plan(
            screen_name,
            self._root_mask_term,
            self._calendar,
            start_date,
            end_date,
        )
        extra_rows = graph.extra_rows[self._root_mask_term]
        root_mask = self._compute_root_mask(start_date, end_date, extra_rows)
        dates, assets, root_mask_values = explode(root_mask)

        initial_workspace = self._populate_initial_workspace(
            {
                self._root_mask_term: root_mask_values,
                self._root_mask_dates_term: as_column(dates.values)
            },
            self._root_mask_term,
            graph,
            dates,
            assets,
        )

        results = self.compute_chunk(
            graph,
            dates,
            assets,
            initial_workspace,
        )

        return self._to_narrow(
            graph.outputs,
            results,
            results.pop(screen_name),
            dates[extra_rows:],
            assets,
        )
Example #23
    def run_pipeline(self, pipeline, start_date, end_date):
        """
        Compute a pipeline.

        The algorithm implemented here can be broken down into the following
        stages:

        0. Build a dependency graph of all terms in `pipeline`.  Topologically
           sort the graph to determine an order in which we can compute the
           terms.

        1. Ask our AssetFinder for a "lifetimes matrix", which should contain,
           for each date between start_date and end_date, a boolean value for
           each known asset indicating whether the asset existed on that date.

        2. Compute each term in the dependency order determined in (0), caching
           the results in a dictionary so that they can be fed into future
           terms.

        3. For each date, determine the number of assets passing
           pipeline.screen.  The sum, N, of all these values is the total
           number of rows in our output frame, so we pre-allocate an output
           array of length N for each factor in `terms`.

        4. Fill in the arrays allocated in (3) by copying computed values from
           our output cache into the corresponding rows.

        5. Stick the values computed in (4) into a DataFrame and return it.

        Step 0 is performed by ``Pipeline.to_graph``.
        Step 1 is performed in ``SimplePipelineEngine._compute_root_mask``.
        Step 2 is performed in ``SimplePipelineEngine.compute_chunk``.
        Steps 3, 4, and 5 are performed in ``SimplePipelineEngine._to_narrow``.

        Parameters
        ----------
        pipeline : zipline.pipeline.Pipeline
            The pipeline to run.
        start_date : pd.Timestamp
            Start date of the computed matrix.
        end_date : pd.Timestamp
            End date of the computed matrix.

        Returns
        -------
        result : pd.DataFrame
            A frame of computed results.

            The ``result`` columns correspond to the entries of
            `pipeline.columns`, which should be a dictionary mapping strings to
            instances of :class:`zipline.pipeline.term.Term`.

            For each date between ``start_date`` and ``end_date``, ``result``
            will contain a row for each asset that passed `pipeline.screen`.
            A screen of ``None`` indicates that a row should be returned for
            each asset that existed each day.

        See Also
        --------
        :meth:`zipline.pipeline.engine.PipelineEngine.run_pipeline`
        :meth:`zipline.pipeline.engine.PipelineEngine.run_chunked_pipeline`
        """
        if end_date < start_date:
            raise ValueError(
                "start_date must be before or equal to end_date \n"
                "start_date=%s, end_date=%s" % (start_date, end_date)
            )

        screen_name = uuid4().hex
        graph = pipeline.to_execution_plan(
            screen_name,
            self._root_mask_term,
            self._calendar,
            start_date,
            end_date,
        )
        extra_rows = graph.extra_rows[self._root_mask_term]
        root_mask = self._compute_root_mask(start_date, end_date, extra_rows)
        dates, assets, root_mask_values = explode(root_mask)

        initial_workspace = self._populate_initial_workspace(
            {
                self._root_mask_term: root_mask_values,
                self._root_mask_dates_term: as_column(dates.values)
            },
            self._root_mask_term,
            graph,
            dates,
            assets,
        )

        results = self.compute_chunk(
            graph,
            dates,
            assets,
            initial_workspace,
        )

        return self._to_narrow(
            graph.outputs,
            results,
            results.pop(screen_name),
            dates[extra_rows:],
            assets,
        )
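For orientation, a hypothetical end-to-end call of this method. The pipeline contents, factor choice, dates, and the already-constructed engine are placeholders, not part of the engine code above.

import pandas as pd
from zipline.pipeline import Pipeline
from zipline.pipeline.data import USEquityPricing
from zipline.pipeline.factors import SimpleMovingAverage

pipe = Pipeline(
    columns={'sma10': SimpleMovingAverage(inputs=[USEquityPricing.close],
                                          window_length=10)},
    screen=USEquityPricing.close.latest > 5.0,
)

# `engine` is assumed to be an already-constructed SimplePipelineEngine.
result = engine.run_pipeline(pipe,
                             start_date=pd.Timestamp('2016-01-05', tz='utc'),
                             end_date=pd.Timestamp('2016-01-08', tz='utc'))
# result: a (date, asset)-indexed frame with one column per pipeline column.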
Example #24
    def run_pipeline(self, pipeline, start_date, end_date):
        """
        Compute a pipeline.

        Parameters
        ----------
        pipeline : zipline.pipeline.Pipeline
            The pipeline to run.
        start_date : pd.Timestamp
            Start date of the computed matrix.
        end_date : pd.Timestamp
            End date of the computed matrix.

        Returns
        -------
        result : pd.DataFrame
            A frame of computed results.

            The ``result`` columns correspond to the entries of
            `pipeline.columns`, which should be a dictionary mapping strings to
            instances of :class:`zipline.pipeline.term.Term`.

            For each date between ``start_date`` and ``end_date``, ``result``
            will contain a row for each asset that passed `pipeline.screen`.
            A screen of ``None`` indicates that a row should be returned for
            each asset that existed each day.

        See Also
        --------
        :meth:`zipline.pipeline.engine.PipelineEngine.run_pipeline`
        :meth:`zipline.pipeline.engine.PipelineEngine.run_chunked_pipeline`
        """
        # See notes at the top of this module for a description of the
        # algorithm implemented here.
        if end_date < start_date:
            raise ValueError(
                "start_date must be before or equal to end_date \n"
                "start_date=%s, end_date=%s" % (start_date, end_date)
            )

        domain = self.resolve_domain(pipeline)

        graph = pipeline.to_execution_plan(
            domain, self._root_mask_term, start_date, end_date,
        )
        extra_rows = graph.extra_rows[self._root_mask_term]
        root_mask = self._compute_root_mask(
            domain, start_date, end_date, extra_rows,
        )
        dates, assets, root_mask_values = explode(root_mask)

        initial_workspace = self._populate_initial_workspace(
            {
                self._root_mask_term: root_mask_values,
                self._root_mask_dates_term: as_column(dates.values)
            },
            self._root_mask_term,
            graph,
            dates,
            assets,
        )

        results = self.compute_chunk(graph, dates, assets, initial_workspace)

        return self._to_narrow(
            graph.outputs,
            results,
            results.pop(graph.screen_name),
            dates[extra_rows:],
            assets,
        )
Example #25
    def run_pipeline(self, pipeline, start_date, end_date):
        """
        Compute a pipeline.

        Parameters
        ----------
        pipeline : zipline.pipeline.Pipeline
            The pipeline to run.
        start_date : pd.Timestamp
            Start date of the computed matrix.
        end_date : pd.Timestamp
            End date of the computed matrix.

        Returns
        -------
        result : pd.DataFrame
            A frame of computed results.

            The ``result`` columns correspond to the entries of
            `pipeline.columns`, which should be a dictionary mapping strings to
            instances of :class:`zipline.pipeline.term.Term`.

            For each date between ``start_date`` and ``end_date``, ``result``
            will contain a row for each asset that passed `pipeline.screen`.
            A screen of ``None`` indicates that a row should be returned for
            each asset that existed each day.

        See Also
        --------
        :meth:`zipline.pipeline.engine.PipelineEngine.run_pipeline`
        :meth:`zipline.pipeline.engine.PipelineEngine.run_chunked_pipeline`
        """
        # See notes at the top of this module for a description of the
        # algorithm implemented here.
        if end_date < start_date:
            raise ValueError(
                "start_date must be before or equal to end_date \n"
                "start_date=%s, end_date=%s" % (start_date, end_date)
            )

        domain = self._resolve_domain(pipeline)

        graph = pipeline.to_execution_plan(
            domain, self._root_mask_term, start_date, end_date,
        )
        extra_rows = graph.extra_rows[self._root_mask_term]
        root_mask = self._compute_root_mask(
            domain, start_date, end_date, extra_rows,
        )
        dates, assets, root_mask_values = explode(root_mask)

        initial_workspace = self._populate_initial_workspace(
            {
                self._root_mask_term: root_mask_values,
                self._root_mask_dates_term: as_column(dates.values)
            },
            self._root_mask_term,
            graph,
            dates,
            assets,
        )

        results = self.compute_chunk(graph, dates, assets, initial_workspace)

        return self._to_narrow(
            graph.outputs,
            results,
            results.pop(graph.screen_name),
            dates[extra_rows:],
            assets,
        )