def current_bar_compared_to_open(ticker, date, _):
    """ Price compared to the beginning of the previous bars.

    The relative change in the mean, min, and max price compared to the
    beginning of the minute, hour, day, and a few refrequencies in between.

    """

    bars = bar_properties.current_bar(ticker, date)
    trading_hours = data.get_trading_hours_index(ticker, date)
    df = pd.DataFrame(index=bars.index)

    # Calculate relative to opening of minutes/hour/day.
    measures = ('price', 'price_min', 'price_max')
    last_opens = ('1min', '5min', '10min', '30min', '1H', '1D')
    for i in last_opens:
        price = bars['price'].copy()
        period_opens = pd.date_range(
            datetime.datetime.combine(date, datetime.time(9, 30)),
            datetime.datetime.combine(date, datetime.time(16, 0)),
            freq=i)
        price[~price.index.isin(period_opens)] = np.nan
        price = price.fillna(method='ffill')
        for measure in measures:
            df[f'open_{i}_{measure}'] = bars[measure] / price - 1

    return df.reindex(trading_hours)
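
# Illustrative sketch (not part of the original module; one-second index and
# prices below are synthetic): the open-price trick above keeps only the prices
# that fall on period boundaries and forward-fills them, so every row is
# compared against the most recent 5-minute open.
import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-02 09:30:00', periods=600, freq='S')
price = pd.Series(100 + np.cumsum(np.random.randn(600)) * 0.01, index=idx)

opens = price.copy()
opens[~opens.index.isin(pd.date_range(idx[0], idx[-1], freq='5min'))] = np.nan
opens = opens.fillna(method='ffill')  # carry the most recent 5-minute open forward
relative_to_open = price / opens - 1  # relative change since that open
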
def consecutive_of_increasing_bars(ticker, date, _):
    """ The number of consequtive bars that increased.

    The number of consecutive bars into the past that increased in price, count,
    or volume after applying moving-average smoothing. The time window spans
    from 1 second to 30 minutes.

    """

    bars = bar_properties.current_bar(ticker, date)
    trading_hours = data.get_trading_hours_index(ticker, date)
    df = pd.DataFrame(index=bars.index)

    # How much time since last decreased.
    measures = ('price', 'count', 'volume')
    windows = ('1S', '3S', '5S', '10S', '30S', '1min', '3min', '5min', '10min',
               '30min')
    for i in windows:
        rolling = bars.rolling(i, min_periods=1).mean()
        for measure in measures:
            signs = np.sign(rolling[measure].diff())
            df[f'{i}_{measure}_since_down'] = signs.eq(1).groupby(
                (signs != signs.shift()).cumsum()).transform('cumsum')

    return df.reindex(trading_hours)
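
# Illustrative sketch (synthetic prices): the groupby/cumsum expression above is
# a run-length trick. Every sign change starts a new group, and the cumulative
# sum of "is an increase" within each group counts consecutive rising bars.
import numpy as np
import pandas as pd

prices = pd.Series([10, 11, 12, 11, 12, 13, 14, 13], dtype=float)
signs = np.sign(prices.diff())
runs = signs.eq(1).groupby((signs != signs.shift()).cumsum()).transform('cumsum')
print(runs.tolist())  # -> [0, 1, 2, 0, 1, 2, 3, 0]
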
def current_bar_compared_to_high_and_low(ticker, date, _):
    """ Price compared to the previous high and low.

    The relative change in the mean, min, and max price compared to the high and
    low of a previous time window, stretching from a minute to the beginning of
    the day (currently not any previous days).

    """

    bars = bar_properties.current_bar(ticker, date)
    trading_hours = data.get_trading_hours_index(ticker, date)
    df = pd.DataFrame(index=bars.index)

    # Calculate relative to time high and low.
    measures = ('price', 'price_min', 'price_max')
    windows = ('1min', '3min', '5min', '10min', '30min', '1H', '1D')
    for i in windows:
        rolling = bars.shift().rolling(i, min_periods=1)
        if i == '1D':
            rolling = bars.shift().reindex(trading_hours).rolling(
                i, min_periods=1)
        for measure in measures:
            df[f'{i}_low_{measure}'] = (
                bars[measure] / rolling['price'].min() - 1)
            df[f'{i}_high_{measure}'] = (
                bars[measure] / rolling['price'].max() - 1)

    return df.reindex(trading_hours)
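
# Illustrative sketch (synthetic prices): shifting by one row before the rolling
# window means the current bar is compared only to the *previous* bars' high and
# low, never to itself.
import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-02 09:30:00', periods=300, freq='S')
bars = pd.DataFrame({'price': 100 + np.cumsum(np.random.randn(300)) * 0.01},
                    index=idx)

prior = bars.shift().rolling('1min', min_periods=1)
vs_low = bars['price'] / prior['price'].min() - 1   # > 0 when above the prior 1-minute low
vs_high = bars['price'] / prior['price'].max() - 1  # < 0 when below the prior 1-minute high
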
def recent_bars_compared_to_preceding(ticker, date, params):
    """ Price and volume of recent aggregate bars compared to the one before it.

    The price (including min, max, and std) and volume (including mean, min,
    max, and std) of a number of recent bars, each compared to the bar
    preceding it.

    Params:
        "periods_to_go_back" (int): The number of periods into the past to use
            as features.
    """

    periods_to_go_back = params.get('periods_to_go_back', 60)

    bars = bar_properties.current_bar(ticker, date)
    trading_hours = data.get_trading_hours_index(ticker, date)

    dfs = []

    bar_changes = pd.DataFrame(index=bars.index)
    bar_changes['price'] = bars['price'].pct_change()
    measures = [
        'price_min_relative', 'price_max_relative', 'price_std_relative',
        'volume', 'volume_min', 'volume_max', 'volume_mean', 'volume_std'
    ]
    for measure in measures:
        bar_changes[measure] = bars[measure].diff()
    for i in range(1, periods_to_go_back):
        dfs.append(bar_changes.shift(i).add_suffix(f'_{i}S_ago_vs_{i-1}S_ago'))

    return pd.concat(dfs, axis=1, sort=False,
                     copy=False).reindex(trading_hours)
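
# Illustrative sketch (synthetic prices): each lagged copy of the one-bar change
# is shifted by i rows, so a column like 'price_3S_ago_vs_2S_ago' holds the
# change that happened between three and two seconds before the current row.
import pandas as pd

prices = pd.Series([100.0, 101.0, 99.0, 102.0, 102.0])
change = prices.pct_change()  # change of each bar vs the one before it
lagged = pd.concat(
    [change.shift(i).rename(f'price_{i}S_ago_vs_{i-1}S_ago') for i in range(1, 4)],
    axis=1)
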
def proportion_of_increasing_bars(ticker, date, _):
    """ Proportion of recent bars that increased.

    The proportion of aggregate bars in a time window that increased in price,
    count, or volume (each as a separate feature). The time windows span from 3
    seconds to the beginning of the day.

    """

    bars = bar_properties.current_bar(ticker, date)
    trading_hours = data.get_trading_hours_index(ticker, date)
    df = pd.DataFrame(index=bars.index)

    # Increase or decrease.
    measures = ('price', 'count', 'volume')
    for measure in measures:
        df[f'{measure}_inc_sign'] = np.sign(bars[measure].diff())

    # Proportion of increases in the last seconds/minutes.
    measures = ('price', 'count', 'volume')
    windows = ('3S', '5S', '10S', '30S', '1min', '3min', '5min', '10min',
               '30min', '1H', '1D')
    for i in windows:
        rolling = df.eq(1).rolling(i, min_periods=1)
        if i == '1D':
            rolling = df.eq(1).reindex(trading_hours).rolling(i, min_periods=1)
        for measure in measures:
            column = f'{measure}_inc_sign'
            df[f'{i}_{column}'] = (rolling[column].sum() /
                                   rolling[column].count())

    return df.reindex(trading_hours)
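
# Illustrative sketch (synthetic prices): the rolling sum divided by the rolling
# count above is simply the fraction of bars in the window whose one-bar change
# was positive.
import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-02 09:30:00', periods=10, freq='S')
price = pd.Series([10, 11, 10, 11, 12, 12, 13, 12, 13, 14], index=idx, dtype=float)

up = np.sign(price.diff()).eq(1)  # True where the bar increased
rolling = up.rolling('5S', min_periods=1)
proportion_up = rolling.sum() / rolling.count()  # share of increases in the last 5 seconds
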
def recent_bars_compared_to_current(ticker, date, params):
    """ Price and volume of recent aggregate bars.

    The price (including min, max, and std) and volume (including mean, min,
    max, and std) of a number of recent bars normalized to now.

    Params:
        "periods_to_go_back" (int): The number of periods into the past to use
            as features.

    """

    periods_to_go_back = params.get('periods_to_go_back', 60)

    bars = bar_properties.current_bar(ticker, date)
    trading_hours = data.get_trading_hours_index(ticker, date)

    dfs = []
    measures = [
        'price_min_relative', 'price_max_relative', 'price_std_relative',
        'volume', 'volume_min', 'volume_max', 'volume_mean', 'volume_std'
    ]
    for i in range(1, periods_to_go_back + 1):
        df = bars[measures] - bars[measures].shift(i)
        df['price'] = bars['price'].pct_change(i)
        dfs.append(df.add_suffix(f'_{i}S_ago_vs_now'))

    return pd.concat(dfs, axis=1, sort=False,
                     copy=False).reindex(trading_hours)
def time_since_holiday(ticker, date, _):
    """ The time since a holiday.

    The number of days since the last holiday that caused the exchange to be
    closed when it would otherwise have been open. Also includes whether the
    day is a half-day on which the exchange closes early due to a holiday
    (e.g. the day after Thanksgiving).

    """

    open_dates = pd.DatetimeIndex(
        data.get_open_dates(ticker,
                            datetime.date(date.year, 1, 1),
                            datetime.date(date.year, 12, 31),
                            exclude_future=False))
    holidays = data.db.get_holidays(data.exchange_for_ticker(ticker))

    df = pd.DataFrame(index=data.get_trading_hours_index(ticker, date))

    half_days = [day for day, hours in holidays if hours == 'half']
    df['is_half_day'] = int(date in half_days)

    timestamp = pd.Timestamp(date)
    closed_days = pd.DatetimeIndex(
        [d for d, hours in holidays if hours == 'closed'])
    last_holiday = closed_days[closed_days.get_loc(timestamp, 'ffill')]
    next_holiday = closed_days[closed_days.get_loc(timestamp, 'bfill')]
    df['since_holiday'] = open_dates.get_loc(timestamp) - open_dates.get_loc(
        last_holiday, 'bfill')
    df['until_holiday'] = open_dates.get_loc(
        next_holiday, 'ffill') - open_dates.get_loc(timestamp)

    return df
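
# Illustrative sketch (made-up holiday dates): with the older pandas API used
# above, DatetimeIndex.get_loc(ts, 'ffill') returns the position of the closest
# date at or before ts, and 'bfill' the closest date at or after it.
import pandas as pd

closed_days = pd.DatetimeIndex(['2020-01-01', '2020-05-25', '2020-07-03'])
ts = pd.Timestamp('2020-03-02')

last_holiday = closed_days[closed_days.get_loc(ts, 'ffill')]  # 2020-01-01
next_holiday = closed_days[closed_days.get_loc(ts, 'bfill')]  # 2020-05-25
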
def current_time_and_date(ticker, date, _):
    """ The time of day and date.

    The second, minute, and hour of the day, as well as the day of the week,
    month, quarter, and year.

    """

    time_periods = data.get_trading_hours_index(ticker, date)
    df = pd.DataFrame(index=time_periods)

    df['time'] = (time_periods -
                  pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
    df['year'] = time_periods.year
    df['day'] = time_periods.day
    df['hour'] = time_periods.hour
    df['minute'] = time_periods.minute
    df['second'] = time_periods.second

    categorical_dfs = []
    categorical_times = [
        (time_periods.quarter, 'quarter', range(1, 4 + 1)),
        (time_periods.month, 'month', range(1, 12 + 1)),
        (time_periods.dayofweek, 'dayofweek', range(0, 4 + 1)),
    ]
    for values, prefix, categories in categorical_times:
        categorical_dfs.append(
            pd.get_dummies(values).T.reindex(categories).T.set_index(
                time_periods).fillna(0).astype(int).add_prefix(prefix + '_'))

    return pd.concat([df] + categorical_dfs, axis=1, sort=False, copy=False)
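
# Illustrative sketch (synthetic data): transposing, reindexing on the full set
# of categories, and transposing back guarantees one indicator column per
# possible value, even when the current date only ever sees one of them.
import pandas as pd

quarters = pd.Series([2, 2, 2])  # every row of this date falls in Q2
dummies = (pd.get_dummies(quarters).T.reindex(range(1, 5)).T
           .fillna(0).astype(int).add_prefix('quarter_'))
print(list(dummies.columns))  # ['quarter_1', 'quarter_2', 'quarter_3', 'quarter_4']
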
def time_since_and_until_start_of(ticker, date, _):
    """ The time since and until the beginning of an significant period.

    The seconds since trading started for the day and until trading closes for
    the day (may be different on half-days after holidays), as well as the
    number of business days since the first and until the last business day of
    the month, quarter, and year.

    """

    time_periods = data.get_trading_hours_index(ticker, date)
    df = pd.DataFrame(index=time_periods)

    # Time from/until start/end of day
    df['since_start_of_day'] = (time_periods - time_periods[0]).seconds
    df['until_end_of_day'] = (time_periods[-1] - time_periods).seconds

    # Business days from/until first/last business day of the year/quarter/month.
    open_dates = pd.DatetimeIndex(
        data.get_open_dates(ticker,
                            datetime.date(date.year, 1, 1),
                            datetime.date(date.year, 12, 31),
                            exclude_future=False))
    quarter_dates = open_dates[
        open_dates.quarter == pd.Timestamp(date).quarter]
    month_dates = open_dates[open_dates.month == date.month]
    df['since_year_start'] = open_dates.date.tolist().index(date)
    df['until_year_end'] = open_dates.date[::-1].tolist().index(date)
    df['since_quarter_start'] = quarter_dates.date.tolist().index(date)
    df['until_quarter_end'] = quarter_dates.date[::-1].tolist().index(date)
    df['since_month_start'] = month_dates.date.tolist().index(date)
    df['until_month_end'] = month_dates.date[::-1].tolist().index(date)

    return df
def current_bar_stats(ticker, date, _):
    """ Stats of the price and volume of the current time period.

    Mean, median, min, max, and standard deviation of price, volume, and price-
    adjusted volume (price * volume).

    """
    bars = current_bar(ticker, date)
    return bars.reindex(data.get_trading_hours_index(ticker, date))
def current_bar_compared_to_rolling(ticker, date, _):
    """ Price and volume compared to a rolling average of previous periods.

    The relative changes in the mean, min, max, and std of the price and volume
    compared to a rolling average. The rolling average stretches from 3 seconds
    to the beginning of the day (but currently not any previous days).

    """

    bars = bar_properties.current_bar(ticker, date)
    trading_hours = data.get_trading_hours_index(ticker, date)
    df = pd.DataFrame(index=bars.index)

    # Calculate relative to rolling averages. For all measures except the price,
    # the absolute difference is calculated instead of the relative difference
    # to prevent infinite values.
    measures = (
        'price',
        'price_min_relative',
        'price_max_relative',
        'price_std_relative',
        'volume',
        'volume_mean',
        'volume_min',
        'volume_max',
        'volume_std',
        'count',
    )
    windows = ('1S', '3S', '5S', '10S', '30S', '1min', '3min', '5min', '10min',
               '30min', '1H', '1D')
    for i in windows:
        rolling = bars.shift().rolling(i, min_periods=1)
        if i == '1D':
            rolling = bars.shift().reindex(trading_hours).rolling(
                i, min_periods=1)
        for measure in measures:
            if measure == 'price':
                df[f'{i}_{measure}'] = (
                    bars[measure] / rolling[measure].mean() - 1)
            else:
                df[f'{i}_{measure}'] = bars[measure] - rolling[measure].mean()

    return df.reindex(trading_hours)
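
# Illustrative sketch (synthetic volumes): when a rolling mean can be zero
# (e.g. volume in a quiet window), the relative change blows up to inf, which
# is why the code above falls back to an absolute difference for every measure
# except the price.
import pandas as pd

volume = pd.Series([0.0, 0.0, 50.0, 80.0])
rolling_mean = volume.shift().rolling(2, min_periods=1).mean()
print((volume / rolling_mean - 1).tolist())  # inf where the mean is 0 but volume is not
print((volume - rolling_mean).tolist())      # no infinities
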
def current_bar(ticker, date):
    """ Calculate stats of trades for every second.

    Calculates mean, median, min, max, and standard deviation of price, volume,
    and price-adjusted volume (price * volume).

    """

    bars = pd.DataFrame(index=data.get_trading_hours_index(
        ticker, date, extended_hours=True
    ))

    # Price (weighted mean), count, and volume.
    bars = bars.join([
        data.get_bars(ticker, date, 'weighted_mean', extended_hours=True)
            .rename('price')
            .fillna(method='ffill'),
        data.get_bars(ticker, date, 'count', extended_hours=True)['price']
            .rename('count')
            .fillna(0),
        data.get_bars(ticker, date, 'sum', extended_hours=True)['volume']
            .rename('volume')
            .fillna(0),
        data.get_bars(ticker, date, 'sum', extended_hours=True)['dollar_volume']
            .rename('dollar_volume')
            .fillna(0),
    ])

    # Price, volume, and price*volume: mean, median, min, max, std, and sum.
    for agg in ['mean', 'median', 'min', 'max', 'std']:
        df = data.get_bars(
            ticker, date, agg, extended_hours=True
        ).add_suffix('_' + agg)

        # In case of no trades for a bar, fill with the previous bar for
        # averages, with the mean for min and max, and with 0 for the standard
        # deviation.
        if agg in ('mean', 'median'):
            df = df.fillna(method='ffill')
        elif agg in ('min', 'max'):
            for prefix in ('price', 'volume', 'dollar_volume'):
                fill_with = bars[prefix + ('' if prefix == 'price' else '_mean')]
                df[f'{prefix}_{agg}'] = df[f'{prefix}_{agg}'].fillna(fill_with)
        elif agg in ('std',):
            df = df.fillna(0)

        bars = bars.join(df)

    # Stats relative to mean.
    measures = ('median', 'min', 'max', 'std')
    prefixes = ('price', 'volume')
    for prefix in prefixes:
        for measure in measures:
            relative_to = 'price' if prefix == 'price' else 'volume_mean'
            bars[f'{prefix}_{measure}_relative'] = (
                bars[f'{prefix}_{measure}'] / bars[relative_to] - 1
            )

    # Center the relative standard deviations at 0 (i.e. std / mean rather
    # than std / mean - 1).
    std_columns = [c for c in bars.columns if c.endswith('_std_relative')]
    bars[std_columns] += 1

    return bars
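
# Illustrative sketch (synthetic trades, using resample in place of the
# unavailable data.get_bars helper): aggregating the same trades several ways
# and joining the suffixed results is the pattern used above to build one wide
# per-second bar frame.
import pandas as pd

trades = pd.DataFrame(
    {'price': [10.0, 10.2, 10.1], 'volume': [100.0, 50.0, 200.0]},
    index=pd.to_datetime(['2020-01-02 09:30:00.1',
                          '2020-01-02 09:30:00.7',
                          '2020-01-02 09:30:01.2']))

bars_sketch = trades.resample('1S').mean().add_suffix('_mean')
bars_sketch = bars_sketch.join(trades.resample('1S').max().add_suffix('_max'))
bars_sketch = bars_sketch.join(trades.resample('1S').std().fillna(0).add_suffix('_std'))
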
def top_recent_trades(ticker, date, params):
    """ Get details of recent trades for each second during the selected date.

    For each timepoint, a number of the most recent trades are selected and
    the three properties of each trade are listed:
    - The price-weighted volume (volume * price).
    - The relative price difference compared to the most recent established
        price.
    - The time in nanoseconds since the trade happened.

    The top trades by price and volume and the bottom trades by price are
    selected and their properties are returned.

    Params:
        "num_of_trades" (int): The number of recent trades to summarize.
        "num_of_top_trades" (int): The number of top trades to use as features.

    """

    num_of_trades = params.get('num_of_trades', 100)
    num_of_top_trades = params.get('num_of_top_trades', 10)

    # Get all trades and price aggregate per second.
    trades = data.get_trades(ticker, date)
    bars = data.get_bars(ticker,
                         date,
                         agg='weighted_mean',
                         smooth_periods=3,
                         extended_hours=True).fillna(method='ffill')
    trade_hours_index = data.get_trading_hours_index(ticker, date)

    # Convert all data to numpy ndarrays outside loop for better performance.
    previous_price = bars.shift(1).reindex(trade_hours_index).to_numpy()
    trades = trades.sort_values('time', ascending=False)  # latest first
    trade_price_arr = trades['price'].to_numpy()
    trade_volume_arr = trades['volume'].to_numpy()
    trade_timestamp_arr = trades['time'].to_numpy(int)

    # Iterate all time points, selecting the attributes of the most recent
    # trades and summarizing them into a dataframe.
    recent_prices = np.full((len(trade_hours_index), num_of_trades), np.nan)
    recent_volumes = np.full((len(trade_hours_index), num_of_trades), np.nan)
    recent_times = np.full((len(trade_hours_index), num_of_trades), np.nan)
    for i, time in enumerate(trade_hours_index.astype(int)):
        first_idx = np.argmax(trade_timestamp_arr < time)
        last_idx = first_idx + num_of_trades

        price = trade_price_arr[first_idx:last_idx]
        recent_prices[i] = (price - previous_price[i]) / previous_price[i]
        recent_volumes[i] = trade_volume_arr[first_idx:last_idx] * price
        recent_times[i] = time - trade_timestamp_arr[first_idx:last_idx]

    # Sort recent trades by price and volume, selecting the top and bottom
    # trades.
    idx_high_price = np.fliplr(np.argsort(recent_prices,
                                          axis=1))[:, :num_of_top_trades]
    idx_low_price = np.argsort(recent_prices, axis=1)[:, :num_of_top_trades]
    idx_volume = np.fliplr(np.argsort(recent_volumes,
                                      axis=1))[:, :num_of_top_trades]

    features = [
        ('price_of_trade_with_{}_highest_price', recent_prices,
         idx_high_price),
        ('volume_of_trade_with_{}_highest_price', recent_volumes,
         idx_high_price),
        ('time_of_trade_with_{}_highest_price', recent_times, idx_high_price),
        ('price_of_trade_with_{}_lowest_price', recent_prices, idx_low_price),
        ('volume_of_trade_with_{}_lowest_price', recent_volumes,
         idx_low_price),
        ('time_of_trade_with_{}_lowest_price', recent_times, idx_low_price),
        ('price_of_trade_with_{}_highest_volume', recent_prices, idx_volume),
        ('volume_of_trade_with_{}_highest_volume', recent_volumes, idx_volume),
        ('time_of_trade_with_{}_highest_volume', recent_times, idx_volume),
    ]

    df = pd.DataFrame(index=trade_hours_index)
    for feature_names, recent_property, idx in features:
        columns = [
            feature_names.replace('{}', str(j))
            for j in range(num_of_top_trades)
        ]
        df[columns] = np.take_along_axis(recent_property, idx, axis=1)

    return df
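
# Illustrative sketch (synthetic values): argsort orders each row, fliplr turns
# that into descending order, and np.take_along_axis gathers the matching
# values, which is how the top trades per second are selected above.
import numpy as np

recent_prices = np.array([[0.1, 0.5, 0.3],
                          [0.7, 0.2, 0.4]])
top2_idx = np.fliplr(np.argsort(recent_prices, axis=1))[:, :2]
top2_vals = np.take_along_axis(recent_prices, top2_idx, axis=1)
print(top2_vals)  # [[0.5 0.3]
                  #  [0.7 0.4]]
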