コード例 #1
0
def get_labels_numba(price_series_np_array, t_events_np_array_epoch,
                     look_forward_window, min_sample_length, step):
    """
    Trend-scanning labels computed on plain NumPy arrays.

    For every position ``i`` a window of ``look_forward_window`` observations starting at ``i`` is sliced from
    both input arrays. A ``[1, t]`` design matrix is passed to ``get_betas`` for each prefix of the price window,
    and the prefix whose slope t-value has the largest absolute value defines the label.

    :param price_series_np_array: (np.ndarray) Prices, sliced positionally with the same offsets as the events array
    :param t_events_np_array_epoch: (np.ndarray) Event times; presumably epoch numbers aligned one-to-one with the
        prices - TODO confirm against the caller
    :param look_forward_window: (int) Number of observations in the look-ahead window
    :param min_sample_length: (int) Smallest prefix length that is fitted
    :param step: (int) Increment between fitted prefix lengths
    :return: (tuple) Two float arrays with one entry per event: label end times and signed t-values. Entries stay
        NaN when the window is shorter than look_forward_window or when no prefix yields a comparable t-value
    """
    n_events = len(t_events_np_array_epoch)
    # NaN is the "no label" marker, so both outputs start out fully NaN.
    t1_array_numba = np.full(n_events, np.nan)  # Array of label end times
    t_values_array_numba = np.full(n_events, np.nan)  # Array of trend t-values

    for i in prange(n_events):
        subset_np_array = price_series_np_array[i:i + look_forward_window]
        subset_np_array_epoch = t_events_np_array_epoch[i:i + look_forward_window]
        if len(subset_np_array) >= look_forward_window:
            # Loop over possible look-ahead windows to get the one which yields maximum t values for b_1 regression coef
            max_abs_t_value = -np.inf  # Maximum abs t-value of b_1 coefficient among l values
            max_t_value_index = 0  # Prefix length with maximum t-value; 0 means "nothing selected yet"
            max_t_value = np.nan  # Maximum t-value signed

            # The range end is exclusive, so the longest prefix fitted has len(subset_np_array) - 1 observations.
            for forward_window in range(min_sample_length,
                                        len(subset_np_array), step):
                y_subset = subset_np_array[:forward_window].reshape(
                    -1, 1)  # y{t}:y_{t+l}

                # Array of [1, 0], [1, 1], [1, 2], ... [1, l] # b_0, b_1 coefficients
                X_subset = np.ones((y_subset.shape[0], 2))
                X_subset[:, 1] = np.arange(y_subset.shape[0])

                # Get regression coefficients estimates
                b_mean_, b_std_ = get_betas(X_subset, y_subset)
                # Check if l gives the maximum t-value among all values {0...L}
                t_beta_1 = (b_mean_[1] / np.sqrt(b_std_[1, 1]))[0]
                # A NaN t-value never compares greater, so it is skipped here.
                if abs(t_beta_1) > max_abs_t_value:
                    max_abs_t_value = abs(t_beta_1)
                    max_t_value = t_beta_1
                    max_t_value_index = forward_window

            # Only store a label when some prefix was actually selected; otherwise the NaN defaults remain.
            if max_t_value_index > 0:
                # The end time is the last observation of the selected prefix.
                t1_array_numba[i] = subset_np_array_epoch[max_t_value_index - 1]
                t_values_array_numba[i] = max_t_value
    return t1_array_numba, t_values_array_numba
コード例 #2
0
def get_trades_based_amihud_lambda(log_ret: list,
                                   dollar_volume: list) -> List[float]:
    """
    Get Amihud lambda from trades data, p.288-289.

    Absolute log returns are fitted against dollar volume (single regressor, no constant column) via
    ``get_betas``.

    :param log_ret: (list) of log returns
    :param dollar_volume: (list) of dollar volumes (price * size)
    :return: (list) Amihud lambda for a bar and its t-value; the t-value is 0 when the dispersion estimate
        returned by get_betas is not strictly positive
    """
    X = np.array(dollar_volume).reshape(-1, 1)
    y = np.abs(np.array(log_ret))
    coef, std = get_betas(X, y)
    # NOTE(review): other callers of get_betas take the square root of its second return value before
    # forming a t-value - confirm whether a sqrt is missing here.
    # A zero (or NaN) dispersion would yield an inf/NaN t-value, so report 0 instead.
    t_value = coef[0] / std[0] if std[0] > 0 else np.array([0])
    return [coef[0], t_value[0]]
コード例 #3
0
def get_trades_based_hasbrouck_lambda(log_ret: list, dollar_volume: list,
                                      aggressor_flags: list) -> List[float]:
    """
    Get Hasbrouck lambda from trades data, p.289-290.

    Absolute log returns are fitted against the signed square root of dollar volume (single regressor, no
    constant column) via ``get_betas``.

    :param log_ret: (list) of log returns
    :param dollar_volume: (list) of dollar volumes (price * size)
    :param aggressor_flags: (list) of trade directions [-1, 1]  (tick rule or aggressor side can be used to define)
    :return: (list) Hasbrouck lambda for a bar and t value; the t-value is 0 when the dispersion estimate
        returned by get_betas is not strictly positive
    """
    signed_sqrt_dollar_volume = np.sqrt(np.array(dollar_volume)) * np.array(aggressor_flags)
    X = signed_sqrt_dollar_volume.reshape(-1, 1)
    y = np.abs(np.array(log_ret))
    coef, std = get_betas(X, y)
    # NOTE(review): other callers of get_betas take the square root of its second return value before
    # forming a t-value - confirm whether a sqrt is missing here.
    # A zero (or NaN) dispersion would yield an inf/NaN t-value, so report 0 instead.
    t_value = coef[0] / std[0] if std[0] > 0 else np.array([0])
    return [coef[0], t_value[0]]
コード例 #4
0
def get_trades_based_kyle_lambda(price_diff: list, volume: list,
                                 aggressor_flags: list) -> List[float]:
    """
    Get Kyle lambda from trades data, p.286-288.

    Price differences are fitted against signed volume (single regressor, no constant column) via
    ``get_betas``.

    :param price_diff: (list) of price diffs
    :param volume: (list) of trades sizes
    :param aggressor_flags: (list) of trade directions [-1, 1]  (tick rule or aggressor side can be used to define)
    :return: (list) Kyle lambda for a bar and t-value; the t-value is 0 when the dispersion estimate
        returned by get_betas is not strictly positive
    """
    signed_volume = np.array(volume) * np.array(aggressor_flags)
    X = signed_volume.reshape(-1, 1)
    y = np.array(price_diff)
    coef, std = get_betas(X, y)
    # NOTE(review): other callers of get_betas take the square root of its second return value before
    # forming a t-value - confirm whether a sqrt is missing here.
    # A zero (or NaN) dispersion would yield an inf/NaN t-value, so report 0 instead.
    t_value = coef[0] / std[0] if std[0] > 0 else np.array([0])
    return [coef[0], t_value[0]]
コード例 #5
0
def get_trades_based_amihud_lambda(log_ret: list,
                                   dollar_volume: list) -> List[float]:
    """
    Advances in Financial Machine Learning, p.288-289.

    Get Amihud lambda from trades data: absolute log returns are fitted against dollar volume
    (single regressor, no constant column) via ``get_betas``.

    :param log_ret: (list) Log returns
    :param dollar_volume: (list) Dollar volumes (price * size)
    :return: (list) Amihud lambda for a bar and its t-value; the t-value is 0 when the dispersion
        estimate returned by get_betas is not strictly positive
    """
    volume_column = np.array(dollar_volume).reshape(-1, 1)
    abs_returns = np.abs(np.array(log_ret))
    betas, betas_dispersion = get_betas(volume_column, abs_returns)

    # A non-positive (or NaN) dispersion cannot produce a meaningful ratio, so fall back to zero.
    if betas_dispersion[0] > 0:
        t_stat = betas[0] / betas_dispersion[0]
    else:
        t_stat = np.array([0])

    return [betas[0], t_stat[0]]
コード例 #6
0
def _get_dfc_for_t(series: pd.Series, molecule: list) -> pd.Series:
    """
    Get Chow-Type Dickey-Fuller Test statistics for each index in molecule

    For every candidate break point the lagged series is zeroed up to (and, for label-based slicing,
    including) that point, the first differences are fitted against it with ``get_betas``, and the ratio
    of the coefficient to the square root of its variance estimate is stored.

    :param series: (pd.Series) Series to test
    :param molecule: (list) Dates to test
    :return: (pd.Series) Statistics for each index from molecule
    """

    dfc_series = pd.Series(index=molecule, dtype='float64')

    # These pieces do not depend on the candidate break point, so compute them once.
    series_diff = series.diff().dropna()
    base_lag = series.shift(1).dropna()
    y = series_diff.loc[base_lag.index].values

    for index in molecule:
        # Work on a fresh copy: the masking below mutates the lag series in place.
        series_lag = base_lag.copy()
        series_lag[:index] = 0  # D_t* indicator: before t* D_t* = 0

        x = series_lag.values
        coefs, coef_vars = get_betas(x.reshape(-1, 1), y)
        b_estimate, b_var = coefs[0], coef_vars[0][0]
        dfc_series[index] = b_estimate / (b_var**0.5)

    return dfc_series
コード例 #7
0
    def test_sadf_test(self):
        """
        Test get_sadf function

        Covers the output length and reference values for every supported model, the constant-series
        case, rejection of an unknown model name, and the NaN fallback of get_betas on a singular matrix.
        """

        log_prices = np.log(self.data.close)
        lags_int = 5
        lags_array = [1, 2, 5, 7]
        min_length = 20

        linear_sadf = get_sadf(log_prices,
                               model='linear',
                               add_const=True,
                               min_length=min_length,
                               lags=lags_int)
        linear_sadf_no_const_lags_arr = get_sadf(log_prices,
                                                 model='linear',
                                                 add_const=False,
                                                 min_length=min_length,
                                                 lags=lags_array)
        quadratic_sadf = get_sadf(log_prices,
                                  model='quadratic',
                                  add_const=True,
                                  min_length=min_length,
                                  lags=lags_int)

        sm_poly_1_sadf = get_sadf(log_prices,
                                  model='sm_poly_1',
                                  add_const=True,
                                  min_length=min_length,
                                  lags=lags_int)
        sm_poly_2_sadf = get_sadf(log_prices,
                                  model='sm_poly_2',
                                  add_const=True,
                                  min_length=min_length,
                                  lags=lags_int)
        sm_power_sadf = get_sadf(log_prices,
                                 model='sm_power',
                                 add_const=True,
                                 min_length=min_length,
                                 lags=lags_int)
        sm_exp_sadf = get_sadf(log_prices,
                               model='sm_exp',
                               add_const=True,
                               min_length=min_length,
                               lags=lags_int)

        sm_power_sadf_phi = get_sadf(log_prices,
                                     model='sm_power',
                                     add_const=True,
                                     min_length=min_length,
                                     lags=lags_int,
                                     phi=0.5)
        sm_exp_sadf_phi = get_sadf(log_prices,
                                   model='sm_exp',
                                   add_const=True,
                                   min_length=min_length,
                                   lags=lags_int,
                                   phi=0.5)

        # Every integer-lag run should drop the same number of leading observations.
        expected_length = log_prices.shape[0] - min_length - lags_int - 1  # -1 for series_diff
        for sadf_series in (sm_power_sadf, linear_sadf, quadratic_sadf,
                            sm_poly_1_sadf, sm_poly_2_sadf, sm_exp_sadf,
                            sm_exp_sadf_phi):
            self.assertEqual(expected_length, sadf_series.shape[0])

        self.assertAlmostEqual(sm_power_sadf.mean(), 28.954, delta=1e-3)
        self.assertAlmostEqual(sm_power_sadf.iloc[29], 17.369, delta=1e-3)

        self.assertAlmostEqual(linear_sadf.mean(), -0.669, delta=1e-3)
        self.assertAlmostEqual(linear_sadf[29], -0.717, delta=1e-3)

        self.assertAlmostEqual(linear_sadf_no_const_lags_arr.mean(),
                               1.899,
                               delta=1e-3)
        self.assertAlmostEqual(linear_sadf_no_const_lags_arr[29],
                               1.252,
                               delta=1e-3)

        self.assertAlmostEqual(quadratic_sadf.mean(), -1.002, delta=1e-3)
        self.assertAlmostEqual(quadratic_sadf[29], -1.460, delta=1e-3)

        self.assertAlmostEqual(sm_poly_1_sadf.mean(), 26.033, delta=1e-3)
        self.assertAlmostEqual(sm_poly_1_sadf[29], 8.350, delta=1e-3)

        self.assertAlmostEqual(sm_poly_2_sadf.mean(), 26.031, delta=1e-3)
        self.assertAlmostEqual(sm_poly_2_sadf[29], 8.353, delta=1e-3)

        self.assertAlmostEqual(sm_exp_sadf.mean(), 28.916, delta=1e-3)
        self.assertAlmostEqual(sm_exp_sadf[29], 17.100, delta=1e-3)

        self.assertAlmostEqual(sm_power_sadf_phi.mean(), 1.4874, delta=1e-3)
        self.assertAlmostEqual(sm_power_sadf_phi.iloc[29], 2.4564, delta=1e-3)

        self.assertAlmostEqual(sm_exp_sadf_phi.mean(), 1.4787, delta=1e-3)
        self.assertAlmostEqual(sm_exp_sadf_phi[29], 2.4183, delta=1e-3)

        # Trivial series case.
        ones_series = pd.Series(index=log_prices.index,
                                data=np.ones(shape=log_prices.shape[0]))
        trivial_sadf = get_sadf(ones_series,
                                model='sm_power',
                                add_const=True,
                                min_length=min_length,
                                lags=lags_int,
                                phi=0.5)
        self.assertTrue((trivial_sadf.unique() == [
            -np.inf
        ]).all())  # All values should be -np.inf

        # Test rubbish model argument.
        self.assertRaises(ValueError,
                          get_sadf,
                          series=log_prices,
                          model='rubbish_string',
                          add_const=True,
                          min_length=min_length,
                          lags=lags_int)

        # Assert that nans are parsed if singular matrix
        singular_matrix = np.array([[1, 0, 0], [-1, 3, 3], [1, 2, 2]])
        b_mean, b_var = get_betas(singular_matrix, singular_matrix)
        # assertTrue(value, expected) would treat `expected` as the failure message and only check
        # truthiness, so the NaN content is verified explicitly.
        self.assertTrue(np.isnan(b_mean).all())
        self.assertTrue(np.isnan(b_var).all())
コード例 #8
0
def trend_scanning_labels(price_series: pd.Series,
                          t_events: list = None,
                          look_forward_window: int = 20,
                          min_sample_length: int = 5,
                          step: int = 1) -> pd.DataFrame:
    """
    `Trend scanning <https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3257419>`_ is both a classification and
    regression labeling technique.

    That can be used in the following ways:

    1. Classification: By taking the sign of t-value for a given observation we can set {-1, 1} labels to define the
       trends as either downward or upward.
    2. Classification: By adding a minimum t-value threshold you can generate {-1, 0, 1} labels for downward, no-trend,
       upward.
    3. The t-values can be used as sample weights in classification problems.
    4. Regression: The t-values can be used in a regression setting to determine the magnitude of the trend.

    The output of this algorithm is a DataFrame with t1 (time stamp for the farthest observation), t-value, returns for
    the trend, and bin.

    Events for which fewer than ``look_forward_window`` observations remain, or for which no regression window
    yields a comparable t-value, get a missing t1 and NaN for t_value, ret and bin.

    :param price_series: (pd.Series) Close prices used to label the data set
    :param t_events: (list) Filtered events, array of pd.Timestamps; defaults to the whole price_series index
    :param look_forward_window: (int) Maximum look forward window used to get the trend value
    :param min_sample_length: (int) Minimum sample length used to fit regression
    :param step: (int) Optimal t-value index is searched every 'step' indices
    :return: (pd.DataFrame) Consists of t1, t-value, ret, bin (label information). t1 - label endtime, tvalue,
        ret - price change %, bin - label value based on price change sign
    """
    # pylint: disable=invalid-name

    if t_events is None:
        t_events = price_series.index

    t1_array = []  # Array of label end times
    t_values_array = []  # Array of trend t-values

    for index in t_events:
        subset = price_series.loc[
            index:].iloc[:look_forward_window]  # Take t:t+L window

        max_abs_t_value = -np.inf  # Maximum abs t-value of b_1 coefficient among l values
        max_t_value_index = None  # Index with maximum t-value
        max_t_value = None  # Maximum t-value signed

        if subset.shape[0] >= look_forward_window:
            # Get optimal label end time value based on regression t-statistics.
            # The range end is exclusive, so the longest window fitted has subset.shape[0] - 1 observations.
            for forward_window in np.arange(min_sample_length, subset.shape[0],
                                            step):
                y_subset = subset.iloc[:forward_window].values.reshape(
                    -1, 1)  # y{t}:y_{t+l}

                # Array of [1, 0], [1, 1], [1, 2], ... [1, l] # b_0, b_1 coefficients
                X_subset = np.ones((y_subset.shape[0], 2))
                X_subset[:, 1] = np.arange(y_subset.shape[0])

                # Get regression coefficients estimates
                b_mean_, b_std_ = get_betas(X_subset, y_subset)
                # Check if l gives the maximum t-value among all values {0...L}
                t_beta_1 = (b_mean_[1] / np.sqrt(b_std_[1, 1]))[0]
                # A NaN t-value never compares greater, so it is skipped here.
                if abs(t_beta_1) > max_abs_t_value:
                    max_abs_t_value = abs(t_beta_1)
                    max_t_value = t_beta_1
                    max_t_value_index = forward_window

        # Store label information (t1, t-value)
        if max_t_value_index is None:
            # Window too short, or no window produced a usable t-value: record a missing label.
            t1_array.append(None)
            t_values_array.append(np.nan)  # NaN (not None) keeps the column numeric for np.sign
        else:
            t1_array.append(subset.index[max_t_value_index - 1])
            t_values_array.append(max_t_value)

    labels = pd.DataFrame({
        't1': t1_array,
        't_value': t_values_array
    },
                          index=t_events)

    # t1 is missing for unlabeled events; .loc raises KeyError on missing labels, reindex yields NaN.
    end_prices = price_series.reindex(labels['t1']).values
    start_prices = price_series.loc[labels.index].values
    labels['ret'] = end_prices / start_prices - 1
    labels['bin'] = np.sign(labels.t_value)

    return labels