Exemple #1
0
    def _append_to_window(self, event):
        """Store ``event`` in the rolling panel and update the day counters.

        The rolling panel is created on first use with a window of
        ``window_length * bars_in_day``.  ``trading_days_total`` is
        incremented when ``event.dt`` is at or past the market close reported
        by ``trading.environment`` for that dt, and ``full`` is set once at
        least ``window_length`` trading days have been counted.
        """
        self.field_names = self._get_field_names(event)

        # Without a fixed universe, use the sids carried by this event.
        sids = (set(event.data.keys()) if self.static_sids is None
                else self.static_sids)

        # Lazily allocate the rolling panel the first time through.
        if self.rolling_panel is None:
            panel_window = self.window_length * self.bars_in_day
            self.rolling_panel = RollingPanel(
                panel_window, self.field_names, sids)

        # Record this event as the newest frame of the panel.
        frame = pd.DataFrame(event.data, index=self.field_names, columns=sids)
        self.rolling_panel.add_frame(event.dt, frame)

        # Decide whether this event closes out a trading day.
        _open, close_dt = trading.environment.get_open_and_close(event.dt)
        if self.bars == 'daily':
            # Daily bars have their dt set to midnight.
            close_dt = close_dt.replace(hour=0, minute=0, second=0)
        if event.dt >= close_dt:
            self.trading_days_total += 1

        self.last_dt = event.dt

        if self.trading_days_total >= self.window_length:
            self.full = True
    def test_basics(self):
        """Compare get_current() with a hand-built panel after every frame."""
        n_periods = 30
        window = 10

        items = ['foo', 'bar', 'baz']
        minor = ['A', 'B', 'C', 'D']

        rp = RollingPanel(window, items, minor, cap_multiple=2)

        dates = pd.date_range('2000-01-01', periods=n_periods, tz='utc')

        # Dates expected to still be inside the window, oldest first.
        window_dates = deque()
        frames_by_date = {}

        for position, date in enumerate(dates):
            frame = pd.DataFrame(np.random.randn(3, 4),
                                 index=items, columns=minor)
            rp.add_frame(date, frame)

            frames_by_date[date] = frame
            window_dates.append(date)
            if position >= window:
                # The window is full: the oldest date falls out.
                window_dates.popleft()

            result = rp.get_current()
            expected = pd.Panel(frames_by_date,
                                items=list(window_dates),
                                major_axis=items,
                                minor_axis=minor).swapaxes(0, 1)
            tm.assert_panel_equal(result, expected)
    def test_basics(self, window=10):
        """Compare get_current() with a hand-built panel after every frame.

        ``window`` is the window length handed to the RollingPanel and the
        number of most recent dates the expected panel is restricted to.
        """
        items = ['bar', 'baz', 'foo']
        minor = ['A', 'B', 'C', 'D']

        rp = RollingPanel(window, items, minor, cap_multiple=2)

        # A bounded deque keeps only the ``window`` most recent dates.
        recent_dates = deque(maxlen=window)
        frames_by_date = {}

        for date in pd.date_range('2000-01-01', periods=30, tz='utc'):
            values = np.random.randn(3, 4)
            frame = pd.DataFrame(values, index=items, columns=minor)

            rp.add_frame(date, frame)

            frames_by_date[date] = frame
            recent_dates.append(date)

            result = rp.get_current()
            expected = pd.Panel(frames_by_date, items=list(recent_dates),
                                major_axis=items, minor_axis=minor)

            tm.assert_panel_equal(result, expected.swapaxes(0, 1))
def run_history_implementations(option='clever', n=500, change_fields=False,
                                copy=False, n_items=15, n_minor=20,
                                change_freq=5, window=100):
    """Exercise one of two trailing-window implementations for ``n`` periods.

    With ``option='clever'`` every frame is pushed into a ``RollingPanel`` and
    ``get_current()`` is called after each push.  Any other ``option`` rebuilds
    a ``pd.Panel`` from a dict of the most recent frames on every iteration.
    The function returns nothing; it only drives the implementations.

    :Arguments:
        option : str
            'clever' selects the RollingPanel path, anything else the
            rebuild-every-time path.
        n : int
            Number of daily periods to run.
        change_fields : bool
            'clever' path only: drop the oldest and add one new item/minor
            label every ``change_freq`` iterations.
        copy : bool
            'clever' path only: copy the result of ``get_current()``.
        n_items, n_minor : int
            Number of item and minor-axis labels.
        change_freq : int
            Label rotation interval, used when ``change_fields`` is true.
        window : int
            Number of periods kept in the window.
    """
    # Real lists are required: the change_fields path rotates these labels,
    # and a Python 3 ``range`` object cannot be appended to.
    items = list(range(n_items))
    minor = list(range(n_minor))
    periods = n

    dates = pd.date_range('2000-01-01', periods=periods, tz='utc')
    frames = {}

    if option == 'clever':
        rp = RollingPanel(window, items, minor, cap_multiple=2)
        major_deque = deque()

        for i in range(periods):
            # Add a new field and drop the oldest one every change_freq
            # iterations.  New lists are built so the label sequences handed
            # to RollingPanel above are never mutated.
            if change_fields and (i % change_freq) == 0:
                minor = minor[1:] + [minor[-1] + 1]
                items = items[1:] + [items[-1] + 1]

            dummy = pd.DataFrame(np.random.randn(len(items), len(minor)),
                                 index=items, columns=minor)

            frame = dummy * (1 + 0.001 * i)
            date = dates[i]

            rp.add_frame(date, frame)

            frames[date] = frame
            major_deque.append(date)

            if i >= window:
                del frames[major_deque.popleft()]

            result = rp.get_current()
            if copy:
                result = result.copy()
    else:
        major_deque = deque()
        dummy = pd.DataFrame(np.random.randn(len(items), len(minor)),
                             index=items, columns=minor)

        for i in range(periods):
            frame = dummy * (1 + 0.001 * i)
            date = dates[i]
            frames[date] = frame
            major_deque.append(date)

            if i >= window:
                del frames[major_deque.popleft()]

            # Rebuilt from scratch on every iteration; the value is unused.
            result = pd.Panel(frames, items=list(major_deque),
                              major_axis=items, minor_axis=minor)
Exemple #5
0
    def _init_panels(self, sids):
        """Allocate the rolling panel(s) for ``sids``.

        When downsampling, two panels are created: ``rolling_panel`` with a
        window of ``bars_in_day`` and ``daily_rolling_panel`` with a window
        of ``window_length``.  Otherwise a single ``rolling_panel`` with a
        window of ``window_length * bars_in_day`` is created.
        """
        if not self.downsample:
            total_bars = self.window_length * self.bars_in_day
            self.rolling_panel = RollingPanel(
                total_bars, self.field_names, sids)
            return

        self.rolling_panel = RollingPanel(
            self.bars_in_day, self.field_names, sids)

        self.daily_rolling_panel = RollingPanel(
            self.window_length, self.field_names, sids)
def create_initial_day_panel(days_needed, fields, sids, dt):
    """Return a RollingPanel whose index buffer is pre-filled with days.

    The days come from ``days_index_at_dt(days_needed, dt)``.  Unless exactly
    one day is requested, the last entry of that index is left out.  The
    panel's window equals the number of days kept, and ``pos`` is set to the
    end of that window.
    """
    days = days_index_at_dt(days_needed, dt)
    if days_needed != 1:
        # With more than one bar, the final entry of the index is dropped;
        # a single-bar request keeps the index as returned.
        days = days[:-1]

    n_days = len(days)
    panel = RollingPanel(n_days, fields, sids)

    for slot, day in enumerate(days):
        panel.index_buf[slot] = day
    panel.pos = n_days

    return panel
def f(option='clever', n=500, copy=False):
    """Drive one of two trailing-window implementations for ``n`` periods.

    ``option='clever'`` pushes every frame into a RollingPanel and calls
    ``get_current()`` after each push (copying the result when ``copy`` is
    true).  Any other ``option`` rebuilds a ``pd.Panel`` from the dict of
    recent frames on every iteration.  Nothing is returned.
    """
    window = 100
    items = range(5)
    minor = range(20)

    dates = pd.date_range('2000-01-01', periods=n, tz='utc')
    frames = {}

    if option == 'clever':
        rp = RollingPanel(window, items, minor, cap_multiple=2)
        seen_dates = deque()
        base = pd.DataFrame(np.random.randn(len(items), len(minor)),
                            index=items, columns=minor)

        for i, date in enumerate(dates):
            frame = base * (1 + 0.001 * i)

            rp.add_frame(date, frame)

            frames[date] = frame
            seen_dates.append(date)

            # Once the window is full, forget the oldest frame.
            if i >= window:
                oldest = seen_dates.popleft()
                del frames[oldest]

            result = rp.get_current()
            if copy:
                result = result.copy()
    else:
        seen_dates = deque()
        base = pd.DataFrame(np.random.randn(len(items), len(minor)),
                            index=items, columns=minor)

        for i, date in enumerate(dates):
            frame = base * (1 + 0.001 * i)
            frames[date] = frame
            seen_dates.append(date)

            # Once the window is full, forget the oldest frame.
            if i >= window:
                oldest = seen_dates.popleft()
                del frames[oldest]

            result = pd.Panel(frames, items=list(seen_dates),
                              major_axis=items, minor_axis=minor)
    def _append_to_window(self, event):
        """Append ``event`` to the rolling panel and track trading days.

        Creates the rolling panel on the first call, adds the event's data as
        a new frame keyed by ``event.dt``, counts a trading day when
        ``event.dt`` has reached the market close for that dt, and marks the
        window as ``full`` once ``window_length`` days have been counted.
        """
        self.field_names = self._get_field_names(event)

        # A static sid list wins; otherwise take the sids from the event.
        if self.static_sids is not None:
            sids = self.static_sids
        else:
            sids = set(event.data.keys())

        # Create the rolling panel if it does not exist yet.
        if self.rolling_panel is None:
            n_bars = self.window_length * self.bars_in_day
            self.rolling_panel = RollingPanel(n_bars, self.field_names, sids)

        # Store the event in the rolling panel.
        event_frame = pd.DataFrame(event.data,
                                   index=self.field_names,
                                   columns=sids)
        self.rolling_panel.add_frame(event.dt, event_frame)

        # Update the trading day counter.
        _, market_close = trading.environment.get_open_and_close(event.dt)
        if self.bars == 'daily':
            # Daily bars have their dt set to midnight.
            market_close = market_close.replace(hour=0, minute=0, second=0)
        if event.dt >= market_close:
            self.trading_days_total += 1

        self.last_dt = event.dt

        if self.trading_days_total >= self.window_length:
            self.full = True
Exemple #9
0
    def _append_to_window(self, event):
        """Append ``event`` to the rolling panel and track trading days.

        The rolling panel is created on first use with a window of
        ``window_length``.  A new trading day is counted whenever the
        calendar date of ``event.dt`` differs from that of ``last_dt``;
        ``full`` is set once ``window_length`` days have been counted.
        """
        self.field_names = self._get_field_names(event)

        if self.static_sids is None:
            sids = set(event.data.keys())
        else:
            sids = self.static_sids

        # Create rolling panel if it does not exist yet
        if self.rolling_panel is None:
            self.rolling_panel = RollingPanel(self.window_length,
                                              self.field_names, sids)

        # Store event in rolling frame
        self.rolling_panel.add_frame(event.dt,
                                     pd.DataFrame(event.data,
                                                  index=self.field_names,
                                                  columns=sids))

        # Update trading day counters.  Compare full calendar dates rather
        # than the day-of-month number: with ``.day`` alone, two events on
        # different dates that share a day number (e.g. Jan 15 and Feb 15)
        # would not register as a new day.
        if self.last_dt.date() != event.dt.date():
            self.last_dt = event.dt
            self.trading_days_total += 1

        if self.trading_days_total >= self.window_length:
            self.full = True
Exemple #10
0
def f(option='clever', n=500, copy=False):
    """Run a trailing-window implementation over ``n`` daily periods.

    With ``option='clever'`` a RollingPanel receives every frame and
    ``get_current()`` is called after each one; the result is copied when
    ``copy`` is true.  With any other ``option`` a ``pd.Panel`` is rebuilt
    from the dict of recent frames on every iteration.  Returns nothing.
    """
    items = range(5)
    minor = range(20)
    window = 100

    dates = pd.date_range('2000-01-01', periods=n, tz='utc')
    frames = {}

    if option == 'clever':
        rp = RollingPanel(window, items, minor, cap_multiple=2)
        date_queue = deque()
        template = pd.DataFrame(np.random.randn(len(items), len(minor)),
                                index=items, columns=minor)

        for step, date in enumerate(dates):
            scaled = template * (1 + 0.001 * step)

            rp.add_frame(date, scaled)

            frames[date] = scaled
            date_queue.append(date)

            if step >= window:
                # Evict the oldest date together with its frame.
                del frames[date_queue.popleft()]

            result = rp.get_current()
            if copy:
                result = result.copy()
    else:
        date_queue = deque()
        template = pd.DataFrame(np.random.randn(len(items), len(minor)),
                                index=items, columns=minor)

        for step, date in enumerate(dates):
            scaled = template * (1 + 0.001 * step)
            frames[date] = scaled
            date_queue.append(date)

            if step >= window:
                # Evict the oldest date together with its frame.
                del frames[date_queue.popleft()]

            result = pd.Panel(frames, items=list(date_queue),
                              major_axis=items, minor_axis=minor)
    def test_alignment(self, env):
        """Check get_current() alignment before and after extend_back().

        A panel with window 2 is seeded with the two middle minutes of a
        four-minute window, then given one frame at the last minute.  The
        current panel is compared with a hand-built one, first as is and
        then after extending the panel back with the two earliest minutes.
        """
        items = ('a', 'b')
        sids = (1, 2)

        dts = env.market_minute_window(
            env.open_and_closes.market_open[0],
            4,
        ).values
        rp = RollingPanel(2, items, sids, initial_dates=dts[1:-1])

        frame = pd.DataFrame(
            data=np.arange(4).reshape((2, 2)),
            columns=sids,
            index=items,
        )

        def expected_panel(values, major_axis):
            # Build the expected float panel and localize its dates to UTC
            # before it is compared with the panel under test.
            panel = pd.Panel(
                np.array(values, float),
                major_axis=major_axis,
                minor_axis=sids,
                items=items,
            )
            panel.major_axis = panel.major_axis.tz_localize('utc')
            return panel

        # One all-NaN row: a date for which no frame has been added.
        nan_row = (np.nan, np.nan)

        rp.add_frame(dts[-1], frame)

        tm.assert_panel_equal(
            rp.get_current(),
            expected_panel(
                ((nan_row, (0, 1)),
                 (nan_row, (2, 3))),
                dts[2:],
            ),
        )

        rp.extend_back(dts[:-2])

        tm.assert_panel_equal(
            rp.get_current(),
            expected_panel(
                ((nan_row, nan_row, nan_row, (0, 1)),
                 (nan_row, nan_row, nan_row, (2, 3))),
                dts,
            ),
        )
Exemple #12
0
 def create_buffer_panel(self, initial_dt):
     """
     Build the RollingPanel used as the minute buffer.

     Its window is the largest ``max_minutes`` over all unique frequencies,
     so it holds enough minutes to service every one of them.
     ``initial_dt`` is not used here.
     """
     widest_window = max(
         frequency.max_minutes for frequency in self.unique_frequencies
     )
     return RollingPanel(
         window=widest_window,
         items=self.fields,
         sids=self.sids,
     )
Exemple #13
0
    def create_digest_panels(self, initial_sids, initial_dt):
        """
        Build a digest RollingPanel for each frequency that needs one.

        Every panel is sized for the highest bar count requested at its
        frequency.  This relies on group_by_frequency sorting each spec list
        by ascending bar count, so the last spec is the largest.

        Returns a ``(panels, first_window_starts, first_window_closes)``
        tuple.  ``panels`` maps frequency -> digest panel; the other two map
        frequency -> first/last minute of the next digest to be rolled.
        """
        panels = {}
        window_starts = {}
        window_closes = {}

        for freq, specs in iteritems(self.frequency_groups):
            # Last spec == the one requiring the most bars (see docstring).
            biggest_spec = specs[-1]

            if biggest_spec.bar_count != 1:
                dates = index_at_dt(biggest_spec, initial_dt)

                # The first digest roll is keyed to the close of the first
                # entry in the initial index.
                first_close = dates[0]
                window_closes[freq] = first_close
                window_starts[freq] = freq.window_open(first_close)

                panels[freq] = RollingPanel(
                    window=len(dates) - 1,
                    items=self.fields,
                    sids=initial_sids,
                )
            else:
                # A single-bar frequency only ever draws its data from
                # self.buffer_panel, so no digest panel is allocated.
                first_open = freq.window_open(initial_dt)
                window_starts[freq] = first_open
                window_closes[freq] = freq.window_close(first_open)

        return panels, window_starts, window_closes
Exemple #14
0
 def create_buffer_panel(self, initial_sids, initial_dt):
     """
     Build the RollingPanel used as the minute buffer.

     The window is the largest ``max_minutes`` over all unique frequencies;
     the panel covers this container's fields and ``initial_sids``.
     ``initial_dt`` is not used here.
     """
     minutes_per_frequency = (
         frequency.max_minutes for frequency in self.unique_frequencies
     )
     window = max(minutes_per_frequency)
     return RollingPanel(window, self.fields, initial_sids)
Exemple #15
0
    def _init_panels(self, sids):
        """Create the rolling panel(s) covering ``sids``.

        Without downsampling, one panel spanning
        ``window_length * bars_in_day`` bars is created.  With downsampling,
        one panel spans ``bars_in_day`` bars and a second, daily panel spans
        ``window_length`` entries.
        """
        if self.downsample:
            intraday_panel = RollingPanel(
                self.bars_in_day, self.field_names, sids)
            self.rolling_panel = intraday_panel

            daily_panel = RollingPanel(
                self.window_length, self.field_names, sids)
            self.daily_rolling_panel = daily_panel
        else:
            bar_count = self.window_length * self.bars_in_day
            self.rolling_panel = RollingPanel(
                bar_count, self.field_names, sids)
    def test_get_current_multiple_call_same_tick(self, env):
        """
        Repeated get_current calls on the same tick must not share data.

        The old get_current copied the data on every call, so changing the
        returned object had no side effects.  To keep the same API, make
        sure that the raw option returns a copy too.
        """
        def data_id(values):
            # Identifies the memory buffer backing an ndarray.
            return values.__array_interface__['data']

        items = ('a', 'b')
        sids = (1, 2)

        dts = env.market_minute_window(
            env.open_and_closes.market_open[0],
            4,
        ).values
        rp = RollingPanel(2, items, sids, initial_dates=dts[1:-1])

        frame = pd.DataFrame(
            data=np.arange(4).reshape((2, 2)),
            columns=sids,
            index=items,
        )

        rp.add_frame(dts[-1], frame)

        # Each get_current call makes a copy.
        cur = rp.get_current()
        cur2 = rp.get_current()
        assert data_id(cur.values) != data_id(cur2.values)

        # Make sure raw follows the same logic.
        raw = rp.get_current(raw=True)
        raw2 = rp.get_current(raw=True)
        assert data_id(raw) != data_id(raw2)
Exemple #17
0
    def _create_panel(self, dt, spec):
        """
        Build a RollingPanel for ``spec`` with a pre-aligned date buffer.

        ``dt`` is first normalized to the spec's data frequency.  The panel
        window is one less than the spec's bar count, and the panel is
        seeded with the dates from ``_create_window_date_buf``.
        """
        frequency = spec.frequency
        aligned_dt = normalize_to_data_freq(frequency.data_frequency, dt)

        n_bars = spec.bar_count - 1

        initial_dates = self._create_window_date_buf(
            n_bars,
            frequency.unit_str,
            frequency.data_frequency,
            aligned_dt,
        )

        return RollingPanel(
            window=n_bars,
            items=self.fields,
            sids=self.sids,
            initial_dates=initial_dates,
        )
    def test_get_current_multiple_call_same_tick(self):
        """
        Repeated get_current calls on the same tick must not share data.

        The old get_current copied the data on every call, so changing the
        returned object had no side effects.  To keep the same API, make
        sure that the raw option returns a copy too.
        """
        def data_id(values):
            # Identifies the memory buffer backing an ndarray.
            return values.__array_interface__['data']

        items = ('a', 'b')
        sids = (1, 2)

        dts = self.env.market_minute_window(
            self.env.open_and_closes.market_open[0], 4,
        ).values
        rp = RollingPanel(2, items, sids, initial_dates=dts[1:-1])

        frame = pd.DataFrame(
            data=np.arange(4).reshape((2, 2)),
            columns=sids,
            index=items,
        )

        rp.add_frame(dts[-1], frame)

        # Each get_current call makes a copy.
        cur = rp.get_current()
        cur2 = rp.get_current()
        assert data_id(cur.values) != data_id(cur2.values)

        # Make sure raw follows the same logic.
        raw = rp.get_current(raw=True)
        raw2 = rp.get_current(raw=True)
        assert data_id(raw) != data_id(raw2)
    def test_adding_and_dropping_items(self, n_items=5, n_minor=10, window=10,
                                       periods=30):
        """Check RollingPanel when item and minor labels change every period.

        Each period one frame is added; afterwards the oldest item label and
        the oldest minor label are dropped from the labels used for the next
        frame and one new label is appended to both.  After every add, the
        axes and contents of ``get_current()`` are compared with
        expectations built by hand.
        """
        # Fixed seed so the shuffle and the random frames are reproducible.
        np.random.seed(123)

        # Labels used to build the next frame; rotated at the end of each
        # iteration.  NOTE(review): these same deque objects are passed to
        # RollingPanel below and mutated afterwards -- confirm RollingPanel
        # does not depend on them staying unchanged.
        items = deque(range(n_items))
        minor = deque(range(n_minor))

        # Labels expected to be present on the panel's axes.
        expected_items = deque(range(n_items))
        expected_minor = deque(range(n_minor))

        # First label value above both initial label ranges.
        first_non_existant = max(n_items, n_minor) + 1
        # We want to add new columns with random order
        add_items = np.arange(first_non_existant, first_non_existant + periods)
        np.random.shuffle(add_items)

        rp = RollingPanel(window, items, minor, cap_multiple=2)

        dates = pd.date_range('2000-01-01', periods=periods, tz='utc')

        # date -> frame for the dates expected to be inside the window.
        frames = {}

        # NOTE(review): expected_frames is appended to but never read in
        # this test.
        expected_frames = deque(maxlen=window)
        expected_dates = deque()

        for i, (date, add_item) in enumerate(zip(dates, add_items)):
            frame = pd.DataFrame(np.random.randn(n_items, n_minor),
                                 index=items, columns=minor)

            if i >= window:
                # Old labels and dates should start to get dropped at every
                # call
                del frames[expected_dates.popleft()]
                expected_minor.popleft()
                expected_items.popleft()

            expected_frames.append(frame)
            expected_dates.append(date)

            rp.add_frame(date, frame)

            frames[date] = frame

            result = rp.get_current()
            # Axis labels are compared sorted, i.e. independent of order.
            np.testing.assert_array_equal(sorted(result.minor_axis.values),
                                          sorted(expected_minor))
            np.testing.assert_array_equal(sorted(result.items.values),
                                          sorted(expected_items))
            # The most recent major-axis entry must hold the frame just added.
            tm.assert_frame_equal(frame.T,
                                  result.ix[frame.index, -1, frame.columns])
            expected_result = pd.Panel(frames).swapaxes(0, 1)
            tm.assert_panel_equal(expected_result,
                                  result)

            # Insert new items
            minor.popleft()
            minor.append(add_item)
            items.popleft()
            items.append(add_item)

            expected_minor.append(add_item)
            expected_items.append(add_item)
Exemple #20
0
def run_history_implementations(option='clever',
                                n=500,
                                change_fields=False,
                                copy=False,
                                n_items=15,
                                n_minor=20,
                                change_freq=5,
                                window=100):
    """Exercise one of two trailing-window implementations for ``n`` periods.

    With ``option='clever'`` every frame is pushed into a ``RollingPanel`` and
    ``get_current()`` is called after each push.  Any other ``option`` rebuilds
    a ``pd.Panel`` from a dict of the most recent frames on every iteration.
    The function returns nothing; it only drives the implementations.

    :Arguments:
        option : str
            'clever' selects the RollingPanel path, anything else the
            rebuild-every-time path.
        n : int
            Number of daily periods to run.
        change_fields : bool
            'clever' path only: drop the oldest and add one new item/minor
            label every ``change_freq`` iterations.
        copy : bool
            'clever' path only: copy the result of ``get_current()``.
        n_items, n_minor : int
            Number of item and minor-axis labels.
        change_freq : int
            Label rotation interval, used when ``change_fields`` is true.
        window : int
            Number of periods kept in the window.
    """
    # Real lists are required: the change_fields path rotates these labels,
    # and a Python 3 ``range`` object cannot be appended to.
    items = list(range(n_items))
    minor = list(range(n_minor))
    periods = n

    dates = pd.date_range('2000-01-01', periods=periods, tz='utc')
    frames = {}

    if option == 'clever':
        rp = RollingPanel(window, items, minor, cap_multiple=2)
        major_deque = deque()

        for i in range(periods):
            # Add a new field and drop the oldest one every change_freq
            # iterations.  New lists are built so the label sequences handed
            # to RollingPanel above are never mutated.
            if change_fields and (i % change_freq) == 0:
                minor = minor[1:] + [minor[-1] + 1]
                items = items[1:] + [items[-1] + 1]

            dummy = pd.DataFrame(np.random.randn(len(items), len(minor)),
                                 index=items,
                                 columns=minor)

            frame = dummy * (1 + 0.001 * i)
            date = dates[i]

            rp.add_frame(date, frame)

            frames[date] = frame
            major_deque.append(date)

            if i >= window:
                del frames[major_deque.popleft()]

            result = rp.get_current()
            if copy:
                result = result.copy()
    else:
        major_deque = deque()
        dummy = pd.DataFrame(np.random.randn(len(items), len(minor)),
                             index=items,
                             columns=minor)

        for i in range(periods):
            frame = dummy * (1 + 0.001 * i)
            date = dates[i]
            frames[date] = frame
            major_deque.append(date)

            if i >= window:
                del frames[major_deque.popleft()]

            # Rebuilt from scratch on every iteration; the value is unused.
            result = pd.Panel(frames,
                              items=list(major_deque),
                              major_axis=items,
                              minor_axis=minor)
Exemple #21
0
    def test_adding_and_dropping_items(self,
                                       n_items=5,
                                       n_minor=10,
                                       window=10,
                                       periods=30):
        """Check RollingPanel when item and minor labels change every period.

        Each period one frame is added; afterwards the oldest item label and
        the oldest minor label are dropped from the labels used for the next
        frame and one new label is appended to both.  After every add, the
        axes and contents of ``get_current()`` are compared with
        expectations built by hand.
        """
        # Fixed seed so the shuffle and the random frames are reproducible.
        np.random.seed(123)

        # Labels used to build the next frame; rotated at the end of each
        # iteration.  NOTE(review): these same deque objects are passed to
        # RollingPanel below and mutated afterwards -- confirm RollingPanel
        # does not depend on them staying unchanged.
        items = deque(range(n_items))
        minor = deque(range(n_minor))

        # Labels expected to be present on the panel's axes.
        expected_items = deque(range(n_items))
        expected_minor = deque(range(n_minor))

        # First label value above both initial label ranges.
        first_non_existant = max(n_items, n_minor) + 1
        # We want to add new columns with random order
        add_items = np.arange(first_non_existant, first_non_existant + periods)
        np.random.shuffle(add_items)

        rp = RollingPanel(window, items, minor, cap_multiple=2)

        dates = pd.date_range('2000-01-01', periods=periods, tz='utc')

        # date -> frame for the dates expected to be inside the window.
        frames = {}

        # NOTE(review): expected_frames is appended to but never read in
        # this test.
        expected_frames = deque(maxlen=window)
        expected_dates = deque()

        for i, (date, add_item) in enumerate(zip(dates, add_items)):
            frame = pd.DataFrame(np.random.randn(n_items, n_minor),
                                 index=items,
                                 columns=minor)

            if i >= window:
                # Old labels and dates should start to get dropped at every
                # call
                del frames[expected_dates.popleft()]
                expected_minor.popleft()
                expected_items.popleft()

            expected_frames.append(frame)
            expected_dates.append(date)

            rp.add_frame(date, frame)

            frames[date] = frame

            result = rp.get_current()
            # Axis labels are compared sorted, i.e. independent of order.
            np.testing.assert_array_equal(sorted(result.minor_axis.values),
                                          sorted(expected_minor))
            np.testing.assert_array_equal(sorted(result.items.values),
                                          sorted(expected_items))
            # The most recent major-axis entry must hold the frame just added.
            tm.assert_frame_equal(frame.T, result.ix[frame.index, -1,
                                                     frame.columns])
            expected_result = pd.Panel(frames).swapaxes(0, 1)
            tm.assert_panel_equal(expected_result, result)

            # Insert new items
            minor.popleft()
            minor.append(add_item)
            items.popleft()
            items.append(add_item)

            expected_minor.append(add_item)
            expected_items.append(add_item)
Exemple #22
0
class BatchTransform(object):
    """Base class for batch transforms with a trailing window of
    variable length. As opposed to pure EventWindows that get a stream
    of events and are bound to a single SID, this class creates a stream
    of pandas panels with each column representing a sid.

    There are two ways to create a new batch window:
    (i) Inherit from BatchTransform and overload get_value(data).
        E.g.:
        ```
        class MyBatchTransform(BatchTransform):
            def get_value(self, data):
               # compute difference between the means of sid 0 and sid 1
               return data[0].mean() - data[1].mean()
        ```

    (ii) Use the batch_transform decorator.
        E.g.:
        ```
        @batch_transform
        def my_batch_transform(data):
            return data[0].mean() - data[1].mean()

        ```

    In your algorithm you would then have to instantiate
    this in the initialize() method:
    ```
    self.my_batch_transform = MyBatchTransform()
    ```

    To then use it, inside of the algorithm handle_data(), call the
    handle_data() of the BatchTransform and pass it the current event.
    (Calling the instance itself goes through __call__, which expects
    the transform *function*, not the data.)
    ```
    result = self.my_batch_transform.handle_data(data)
    ```

    """

    def __init__(self,
                 func=None,
                 refresh_period=0,
                 window_length=None,
                 clean_nans=True,
                 sids=None,
                 fields=None,
                 compute_only_full=True,
                 bars='daily',
                 downsample=False):
        """Instantiate new batch_transform object.

        :Arguments:
            func : python function <optional>
                If supplied will be called after each refresh_period
                with the data panel and all args and kwargs supplied
                to the handle_data() call.
            refresh_period : int
                Interval to wait between advances in the window.
                0 means 'recompute on every new bar'.
            window_length : int
                How many days the trailing window should have.
            clean_nans : bool <default=True>
                Whether to (forward) fill in nans.
            sids : list <optional>
                Which sids to include in the moving window.  If not
                supplied sids will be extracted from incoming
                events.
            fields : list <optional>
                Which fields to include in the moving window
                (e.g. 'price'). If not supplied, fields will be
                extracted from incoming events.
            compute_only_full : bool <default=True>
                Only call the user-defined function once the window is
                full. Returns None if window is not full yet.
            bars : str <default='daily'>
                Either 'daily' or 'minute'; any other value raises
                ValueError.
            downsample : bool <default=False>
                If true, downsample bars to daily bars. Otherwise, do nothing.
        """
        if func is not None:
            self.compute_transform_value = func
        else:
            self.compute_transform_value = self.get_value

        self.clean_nans = clean_nans
        self.compute_only_full = compute_only_full
        # no need to down sample if the bars are already daily
        self.downsample = downsample and (bars == 'minute')

        # How many bars are in a day
        self.bars = bars
        if self.bars == 'daily':
            self.bars_in_day = 1
        elif self.bars == 'minute':
            self.bars_in_day = int(6.5 * 60)
        else:
            raise ValueError('%s bars not understood.' % self.bars)

        # The following logic is to allow pre-specified sid filters
        # to operate on the data, but to also allow new symbols to
        # enter the batch transform's window IFF a sid filter is not
        # specified.
        if sids is not None:
            if isinstance(sids, (string_types, Integral)):
                self.static_sids = set([sids])
            else:
                self.static_sids = set(sids)
        else:
            self.static_sids = None

        self.initial_field_names = fields
        if isinstance(self.initial_field_names, string_types):
            self.initial_field_names = [self.initial_field_names]
        self.field_names = set()

        self.refresh_period = refresh_period

        check_window_length(window_length)
        self.window_length = window_length

        self.trading_days_total = 0
        self.window = None

        self.full = False
        # Set to -inf essentially to cause update on first attempt.
        self.last_dt = pd.Timestamp('1900-1-1', tz='UTC')

        # Market close of the most recent trading day seen. Initialised
        # here so that get_transform_value() can compare against it even
        # before the first trading-day event has been appended.
        self.mkt_close = None

        self.updated = False
        self.cached = None
        self.last_args = None
        self.last_kwargs = None

        # Data panel that provides bar information to fill in the window,
        # when no bar ticks are available from the data source generator
        # Used in universes that 'rollover', e.g. one that has a different
        # set of stocks per quarter
        self.supplemental_data = None

        self.rolling_panel = None
        self.daily_rolling_panel = None

    def handle_data(self, data, *args, **kwargs):
        """
        Point of entry. Process an event frame.

        Appends the frame to the trailing window when it is newer than
        the last one seen, then returns the (possibly cached) transform
        value. Extra args/kwargs are forwarded to the transform function.
        """
        # extract dates
        dts = [event.dt for event in itervalues(data._data)]
        # we have to provide the event with a dt. This is only for
        # checking if the event is outside the window or not so a
        # couple of seconds shouldn't matter. We don't add it to
        # the data parameter, because it would mix dt with the
        # sid keys.
        event = Event()
        event.dt = max(dts)
        event.data = {k: v.__dict__ for k, v in iteritems(data._data)
                      # Need to check if data has a 'length' to filter
                      # out sids without trade data available.
                      # TODO: expose more of 'no trade available'
                      # functionality to zipline
                      if len(v)}

        # only modify the trailing window if this is
        # a new event. This is intended to make handle_data
        # idempotent.
        if self.last_dt < event.dt:
            self.updated = True
            self._append_to_window(event)
        else:
            self.updated = False

        # return newly computed or cached value
        return self.get_transform_value(*args, **kwargs)

    def _init_panels(self, sids):
        """Create the rolling panel(s) sized for the configured window.

        When downsampling, the intraday panel holds one day of bars and
        a second panel holds window_length daily rows; otherwise a single
        panel holds window_length * bars_in_day rows.
        """
        if self.downsample:
            self.rolling_panel = RollingPanel(self.bars_in_day,
                                              self.field_names, sids)

            self.daily_rolling_panel = RollingPanel(self.window_length,
                                                    self.field_names, sids)
        else:
            self.rolling_panel = RollingPanel(self.window_length *
                                              self.bars_in_day,
                                              self.field_names, sids)

    def _append_to_window(self, event):
        """Add one event frame to the rolling panel and update the
        trading-day bookkeeping (mkt_close, trading_days_total, full).
        """
        self.field_names = self._get_field_names(event)

        if self.static_sids is None:
            sids = set(event.data.keys())
        else:
            sids = self.static_sids

        # the panel sent to the transform code will have
        # columns masked with this set of sids. This is how
        # we guarantee that all (and only) the sids sent to the
        # algorithm's handle_data and passed to the batch
        # transform. See the get_data method to see it applied.
        # N.B. that the underlying panel grows monotonically
        # if the set of sids changes over time.
        self.latest_sids = sids
        # Create rolling panel if not existent
        if self.rolling_panel is None:
            self._init_panels(sids)

        # Store event in rolling frame
        self.rolling_panel.add_frame(event.dt,
                                     pd.DataFrame(event.data,
                                                  index=self.field_names,
                                                  columns=sids))

        # update trading day counters
        # we may get events from non-trading sources which occur on
        # non-trading days. The book-keeping for market close and
        # trading day counting should only consider trading days.
        if trading.environment.is_trading_day(event.dt):
            _, mkt_close = trading.environment.get_open_and_close(event.dt)
            if self.bars == 'daily':
                # Daily bars have their dt set to midnight.
                mkt_close = trading.environment.normalize_date(mkt_close)
            if event.dt == mkt_close:
                if self.downsample:
                    downsample_panel(self.rolling_panel,
                                     self.daily_rolling_panel,
                                     mkt_close
                                     )
                self.trading_days_total += 1
            self.mkt_close = mkt_close

        self.last_dt = event.dt

        if self.trading_days_total >= self.window_length:
            self.full = True

    def get_transform_value(self, *args, **kwargs):
        """Call user-defined batch-transform function passing all
        arguments.

        Note that this will only call the transform if the datapanel
        has actually been updated. Otherwise, the previously cached
        value will be returned.
        """
        if self.compute_only_full and not self.full:
            return None

        #################################################
        # Determine whether we should call the transform
        # 0. Support historical/legacy usage of '0' signaling,
        #    'update on every bar'
        if self.refresh_period == 0:
            period_signals_update = True
        else:
            # 1. Is the refresh period over?
            period_signals_update = (
                self.trading_days_total % self.refresh_period == 0)
        # 2. Have the args or kwargs been changed since last time?
        args_updated = args != self.last_args or kwargs != self.last_kwargs
        # 3. Is this a downsampled batch, and is the last event mkt close?
        #    (mkt_close stays None until a trading-day event was seen,
        #    in which case a downsampled batch is simply not ready.)
        downsample_ready = not self.downsample or \
            self.last_dt == self.mkt_close

        recalculate_needed = downsample_ready and \
            (args_updated or (period_signals_update and self.updated))
        ###################################################

        if recalculate_needed:
            self.cached = self.compute_transform_value(
                self.get_data(),
                *args,
                **kwargs
            )

        self.last_args = args
        self.last_kwargs = kwargs
        return self.cached

    def get_data(self):
        """Create a pandas.Panel (i.e. 3d DataFrame) from the
        events in the current window.

        Returns:
        The resulting panel looks like this:
        index : field_name (e.g. price)
        major axis/rows : dt
        minor axis/columns : sid
        """
        if self.downsample:
            data = self.daily_rolling_panel.get_current()
        else:
            data = self.rolling_panel.get_current()

        if self.supplemental_data is not None:
            for item in data.items:
                if item not in self.supplemental_data.items:
                    continue
                for dt in data.major_axis:
                    try:
                        supplemental_for_dt = self.supplemental_data.ix[
                            item, dt, :]
                    except KeyError:
                        # Only filling in data available in supplemental data.
                        supplemental_for_dt = None

                    if supplemental_for_dt is not None:
                        data[item].ix[dt] = \
                            supplemental_for_dt.combine_first(
                                data[item].ix[dt])

        # screen out sids no longer in the multiverse
        data = data.ix[:, :, self.latest_sids]
        if self.clean_nans:
            # Fills in gaps of missing data during transform
            # of multiple stocks. E.g. we may be missing
            # minute data because of illiquidity of one stock
            data = data.fillna(method='ffill')

        # Hold on to a reference to the data,
        # so that it's easier to find the current data when stepping
        # through with a debugger
        self._curr_data = data

        return data

    def get_value(self, *args, **kwargs):
        """Default transform; subclasses override it or pass `func`."""
        raise NotImplementedError(
            "Either overwrite get_value or provide a func argument.")

    def __call__(self, f):
        """Install `f` as the transform function and return the bound
        handle_data, so the instance can be applied to a function.
        """
        self.compute_transform_value = f
        return self.handle_data

    def _extract_field_names(self, event):
        """Return the set of numeric field names found in event.data.

        event.data maps sid -> dict of attributes; only attributes whose
        value is a number are kept, and bookkeeping fields are dropped.
        An event without any sid data yields an empty set.
        """
        # Integral covers int, (Python 2) long and the numpy integer
        # scalars; float covers numpy.float64 as well. The removed numpy
        # aliases numpy.float / numpy.long were just float / long.
        numeric_types = (Integral, float, numpy.integer)

        # extract field names from sids (price, volume etc).
        sid_keys = []
        for sid in event.data.values():
            keys = set([name for name, value in sid.items()
                        if isinstance(value, numeric_types)])
            sid_keys.append(keys)

        # with CUSTOM data events, there may be different fields
        # per sid. So the allowable keys are the union of all events.
        # set().union(...) (rather than set.union(...)) also works when
        # no sid carried data for this bar.
        union = set().union(*sid_keys)
        unwanted_fields = set(['portfolio', 'sid', 'dt', 'type', 'source_id'])
        return union - unwanted_fields

    def _get_field_names(self, event):
        """Return the field names to track for this event.

        Explicitly configured fields win; otherwise the names seen so
        far are extended with the ones extracted from `event`.
        """
        if self.initial_field_names is not None:
            return self.initial_field_names
        else:
            self.latest_names = self._extract_field_names(event)
            return set.union(self.field_names, self.latest_names)
    def test_alignment(self, env):
        """Check the dates/values reported by a RollingPanel that was
        seeded with initial dates, first after adding one frame and then
        after extending the panel backwards.
        """
        items = ('a', 'b')
        sids = (1, 2)

        # Four consecutive market minutes starting at the first open.
        dts = env.market_minute_window(
            env.open_and_closes.market_open[0], 4,
        ).values
        # Window of 2, pre-populated with the two middle dates.
        rp = RollingPanel(2, items, sids, initial_dates=dts[1:-1])

        frame = pd.DataFrame(
            data=np.arange(4).reshape((2, 2)),
            columns=sids,
            index=items,
        )

        rp.add_frame(dts[-1], frame)

        # After adding the last date the window is expected to cover
        # dts[2:], with NaNs for the seeded date and the frame's values
        # for the newly added one.
        cur = rp.get_current()
        data = np.array((((np.nan, np.nan),
                          (0, 1)),
                         ((np.nan, np.nan),
                          (2, 3))),
                        float)
        expected = pd.Panel(
            data,
            major_axis=dts[2:],
            minor_axis=sids,
            items=items,
        )
        expected.major_axis = expected.major_axis.tz_localize('utc')
        tm.assert_panel_equal(
            cur,
            expected,
        )

        rp.extend_back(dts[:-2])

        # After extending back the panel is expected to span all four
        # dates, NaN everywhere except the last (added) row.
        cur = rp.get_current()
        data = np.array((((np.nan, np.nan),
                          (np.nan, np.nan),
                          (np.nan, np.nan),
                          (0, 1)),
                         ((np.nan, np.nan),
                          (np.nan, np.nan),
                          (np.nan, np.nan),
                          (2, 3))),
                        float)
        expected = pd.Panel(
            data,
            major_axis=dts,
            minor_axis=sids,
            items=items,
        )
        expected.major_axis = expected.major_axis.tz_localize('utc')
        tm.assert_panel_equal(
            cur,
            expected,
        )
Exemple #24
0
class BatchTransform(object):
    """Base class for batch transforms with a trailing window of
    variable length. As opposed to pure EventWindows that get a stream
    of events and are bound to a single SID, this class creates a stream
    of pandas panels with each column representing a sid.

    There are two ways to create a new batch window:
    (i) Inherit from BatchTransform and overload get_value(data).
        E.g.:
        ```
        class MyBatchTransform(BatchTransform):
            def get_value(self, data):
               # compute difference between the means of sid 0 and sid 1
               return data[0].mean() - data[1].mean()
        ```

    (ii) Use the batch_transform decorator.
        E.g.:
        ```
        @batch_transform
        def my_batch_transform(data):
            return data[0].mean() - data[1].mean()

        ```

    In your algorithm you would then have to instantiate
    this in the initialize() method:
    ```
    self.my_batch_transform = MyBatchTransform()
    ```

    To then use it, inside of the algorithm handle_data(), call the
    handle_data() of the BatchTransform and pass it the current event.
    (Calling the instance itself goes through __call__, which expects
    the transform *function*, not the data.)
    ```
    result = self.my_batch_transform.handle_data(data)
    ```

    """

    def __init__(self,
                 func=None,
                 refresh_period=0,
                 window_length=None,
                 clean_nans=True,
                 sids=None,
                 fields=None,
                 compute_only_full=True,
                 bars='daily',
                 downsample=False):
        """Instantiate new batch_transform object.

        :Arguments:
            func : python function <optional>
                If supplied will be called after each refresh_period
                with the data panel and all args and kwargs supplied
                to the handle_data() call.
            refresh_period : int
                Interval to wait between advances in the window.
                0 means 'recompute on every new bar'.
            window_length : int
                How many days the trailing window should have.
            clean_nans : bool <default=True>
                Whether to (forward) fill in nans.
            sids : list <optional>
                Which sids to include in the moving window.  If not
                supplied sids will be extracted from incoming
                events.
            fields : list <optional>
                Which fields to include in the moving window
                (e.g. 'price'). If not supplied, fields will be
                extracted from incoming events.
            compute_only_full : bool <default=True>
                Only call the user-defined function once the window is
                full. Returns None if window is not full yet.
            bars : str <default='daily'>
                Either 'daily' or 'minute'; any other value raises
                ValueError.
            downsample : bool <default=False>
                If true, downsample bars to daily bars. Otherwise, do nothing.
        """
        if func is not None:
            self.compute_transform_value = func
        else:
            self.compute_transform_value = self.get_value

        self.clean_nans = clean_nans
        self.compute_only_full = compute_only_full
        # no need to down sample if the bars are already daily
        self.downsample = downsample and (bars == 'minute')

        # How many bars are in a day
        self.bars = bars
        if self.bars == 'daily':
            self.bars_in_day = 1
        elif self.bars == 'minute':
            self.bars_in_day = int(6.5 * 60)
        else:
            raise ValueError('%s bars not understood.' % self.bars)

        # The following logic is to allow pre-specified sid filters
        # to operate on the data, but to also allow new symbols to
        # enter the batch transform's window IFF a sid filter is not
        # specified.
        if sids is not None:
            if isinstance(sids, (basestring, Integral)):
                self.static_sids = set([sids])
            else:
                self.static_sids = set(sids)
        else:
            self.static_sids = None

        self.initial_field_names = fields
        if isinstance(self.initial_field_names, basestring):
            self.initial_field_names = [self.initial_field_names]
        self.field_names = set()

        self.refresh_period = refresh_period

        check_window_length(window_length)
        self.window_length = window_length

        self.trading_days_total = 0
        self.window = None

        self.full = False
        # Set to -inf essentially to cause update on first attempt.
        self.last_dt = pd.Timestamp('1900-1-1', tz='UTC')

        # Market close of the most recent trading day seen. Initialised
        # here so that get_transform_value() can compare against it even
        # before the first trading-day event has been appended.
        self.mkt_close = None

        self.updated = False
        self.cached = None
        self.last_args = None
        self.last_kwargs = None

        # Data panel that provides bar information to fill in the window,
        # when no bar ticks are available from the data source generator
        # Used in universes that 'rollover', e.g. one that has a different
        # set of stocks per quarter
        self.supplemental_data = None

        self.rolling_panel = None
        self.daily_rolling_panel = None

    def handle_data(self, data, *args, **kwargs):
        """
        Point of entry. Process an event frame.

        Appends the frame to the trailing window when it is newer than
        the last one seen, then returns the (possibly cached) transform
        value. Extra args/kwargs are forwarded to the transform function.
        """
        # extract dates
        dts = [event.datetime for event in data.itervalues()]
        # we have to provide the event with a dt. This is only for
        # checking if the event is outside the window or not so a
        # couple of seconds shouldn't matter. We don't add it to
        # the data parameter, because it would mix dt with the
        # sid keys.
        event = Event()
        event.dt = max(dts)
        event.data = {k: v.__dict__ for k, v in data.iteritems()
                      # Need to check if data has a 'length' to filter
                      # out sids without trade data available.
                      # TODO: expose more of 'no trade available'
                      # functionality to zipline
                      if len(v)}

        # only modify the trailing window if this is
        # a new event. This is intended to make handle_data
        # idempotent.
        if self.last_dt < event.dt:
            self.updated = True
            self._append_to_window(event)
        else:
            self.updated = False

        # return newly computed or cached value
        return self.get_transform_value(*args, **kwargs)

    def _init_panels(self, sids):
        """Create the rolling panel(s) sized for the configured window.

        When downsampling, the intraday panel holds one day of bars and
        a second panel holds window_length daily rows; otherwise a single
        panel holds window_length * bars_in_day rows.
        """
        if self.downsample:
            self.rolling_panel = RollingPanel(self.bars_in_day,
                                              self.field_names, sids)

            self.daily_rolling_panel = RollingPanel(self.window_length,
                                                    self.field_names, sids)
        else:
            self.rolling_panel = RollingPanel(self.window_length *
                                              self.bars_in_day,
                                              self.field_names, sids)

    def _append_to_window(self, event):
        """Add one event frame to the rolling panel and update the
        trading-day bookkeeping (mkt_close, trading_days_total, full).
        """
        self.field_names = self._get_field_names(event)

        if self.static_sids is None:
            sids = set(event.data.keys())
        else:
            sids = self.static_sids

        # the panel sent to the transform code will have
        # columns masked with this set of sids. This is how
        # we guarantee that all (and only) the sids sent to the
        # algorithm's handle_data and passed to the batch
        # transform. See the get_data method to see it applied.
        # N.B. that the underlying panel grows monotonically
        # if the set of sids changes over time.
        self.latest_sids = sids
        # Create rolling panel if not existent
        if self.rolling_panel is None:
            self._init_panels(sids)

        # Store event in rolling frame
        self.rolling_panel.add_frame(event.dt,
                                     pd.DataFrame(event.data,
                                                  index=self.field_names,
                                                  columns=sids))

        # update trading day counters
        # we may get events from non-trading sources which occur on
        # non-trading days. The book-keeping for market close and
        # trading day counting should only consider trading days.
        if trading.environment.is_trading_day(event.dt):
            _, mkt_close = trading.environment.get_open_and_close(event.dt)
            if self.bars == 'daily':
                # Daily bars have their dt set to midnight.
                mkt_close = trading.environment.normalize_date(mkt_close)
            if event.dt == mkt_close:
                if self.downsample:
                    downsample_panel(self.rolling_panel,
                                     self.daily_rolling_panel,
                                     mkt_close
                                     )
                self.trading_days_total += 1
            self.mkt_close = mkt_close

        self.last_dt = event.dt

        if self.trading_days_total >= self.window_length:
            self.full = True

    def get_transform_value(self, *args, **kwargs):
        """Call user-defined batch-transform function passing all
        arguments.

        Note that this will only call the transform if the datapanel
        has actually been updated. Otherwise, the previously cached
        value will be returned.
        """
        if self.compute_only_full and not self.full:
            return None

        #################################################
        # Determine whether we should call the transform
        # 0. Support historical/legacy usage of '0' signaling,
        #    'update on every bar'
        if self.refresh_period == 0:
            period_signals_update = True
        else:
            # 1. Is the refresh period over?
            period_signals_update = (
                self.trading_days_total % self.refresh_period == 0)
        # 2. Have the args or kwargs been changed since last time?
        args_updated = args != self.last_args or kwargs != self.last_kwargs
        # 3. Is this a downsampled batch, and is the last event mkt close?
        #    (mkt_close stays None until a trading-day event was seen,
        #    in which case a downsampled batch is simply not ready.)
        downsample_ready = not self.downsample or \
            self.last_dt == self.mkt_close

        recalculate_needed = downsample_ready and \
            (args_updated or (period_signals_update and self.updated))
        ###################################################

        if recalculate_needed:
            self.cached = self.compute_transform_value(
                self.get_data(),
                *args,
                **kwargs
            )

        self.last_args = args
        self.last_kwargs = kwargs
        return self.cached

    def get_data(self):
        """Create a pandas.Panel (i.e. 3d DataFrame) from the
        events in the current window.

        Returns:
        The resulting panel looks like this:
        index : field_name (e.g. price)
        major axis/rows : dt
        minor axis/columns : sid
        """
        if self.downsample:
            data = self.daily_rolling_panel.get_current()
        else:
            data = self.rolling_panel.get_current()

        # Compare against None explicitly: the truth value of a pandas
        # container is ambiguous and evaluating it raises ValueError.
        if self.supplemental_data is not None:
            for item in data.items:
                if item not in self.supplemental_data.items:
                    continue
                for dt in data.major_axis:
                    try:
                        supplemental_for_dt = self.supplemental_data.ix[
                            item, dt, :]
                    except KeyError:
                        # Only filling in data available in supplemental data.
                        supplemental_for_dt = None

                    if supplemental_for_dt is not None:
                        data[item].ix[dt] = \
                            supplemental_for_dt.combine_first(
                                data[item].ix[dt])

        # screen out sids no longer in the multiverse
        data = data.ix[:, :, self.latest_sids]
        if self.clean_nans:
            # Fills in gaps of missing data during transform
            # of multiple stocks. E.g. we may be missing
            # minute data because of illiquidity of one stock
            data = data.fillna(method='ffill')

        # Hold on to a reference to the data,
        # so that it's easier to find the current data when stepping
        # through with a debugger
        self._curr_data = data

        return data

    def get_value(self, *args, **kwargs):
        """Default transform; subclasses override it or pass `func`."""
        raise NotImplementedError(
            "Either overwrite get_value or provide a func argument.")

    def __call__(self, f):
        """Install `f` as the transform function and return the bound
        handle_data, so the instance can be applied to a function.
        """
        self.compute_transform_value = f
        return self.handle_data

    def _extract_field_names(self, event):
        """Return the set of numeric field names found in event.data.

        event.data maps sid -> dict of attributes; only attributes whose
        value is a number are kept, and bookkeeping fields are dropped.
        An event without any sid data yields an empty set.
        """
        # Integral covers int, (Python 2) long and the numpy integer
        # scalars; float covers numpy.float64 as well. The removed numpy
        # aliases numpy.float / numpy.long were just float / long.
        numeric_types = (Integral, float, numpy.integer)

        # extract field names from sids (price, volume etc).
        # values() is available on dicts in both Python 2 and 3.
        sid_keys = []
        for sid in event.data.values():
            keys = set([name for name, value in sid.items()
                        if isinstance(value, numeric_types)])
            sid_keys.append(keys)

        # with CUSTOM data events, there may be different fields
        # per sid. So the allowable keys are the union of all events.
        # set().union(...) (rather than set.union(...)) also works when
        # no sid carried data for this bar.
        union = set().union(*sid_keys)
        unwanted_fields = set(['portfolio', 'sid', 'dt', 'type',
                               'datetime', 'source_id'])
        return union - unwanted_fields

    def _get_field_names(self, event):
        """Return the field names to track for this event.

        Explicitly configured fields win; otherwise the names seen so
        far are extended with the ones extracted from `event`.
        """
        if self.initial_field_names is not None:
            return self.initial_field_names
        else:
            self.latest_names = self._extract_field_names(event)
            return set.union(self.field_names, self.latest_names)