Example No. 1
    def get_spread(ob_state: dd) -> float:
        best_bid = DataSplitter.get_side("buy", ob_state)['price'].max()
        best_ask = DataSplitter.get_side("sell", ob_state)['price'].min()

        return best_ask - best_bid
Example No. 2
    def get_feed_stats(df: dd) -> Dict[str, Union[int, Any]]:
        """Calculate and print some statistics relating to the data feed"""
        stats = {
            'num_total_msgs':
            get_total(df),
            'num_trades':
            Statistics.get_reason_count('filled', df),
            'num_cancel':
            Statistics.get_reason_count('canceled', df),
            'num_received':
            Statistics.get_type_count('received', df),
            'num_open':
            Statistics.get_type_count('open', df),
            'num_done':
            Statistics.get_type_count('done', df),
            'num_match':
            Statistics.get_type_count('match', df),
            'num_change':
            Statistics.get_type_count('change', df),
            'avg_trade_price':
            Statistics.get_mean('price', DataSplitter.get_trades(df)),
            'std_dev_trade_price':
            Statistics.get_std_dev('price', DataSplitter.get_trades(df))
        }

        return stats
Example No. 3
    def __init__(self, start_ob_state_df: pd.DataFrame, st: datetime.datetime,
                 start_seq: int):
        self.column_order = ['price', 'order_id', 'side', 'size']

        bids = DataSplitter.get_side("buy", start_ob_state_df)
        # have to change the sign of the prices column so that we can use a min heap as a max heap...
        bids['price'] = bids['price'].apply(lambda x: -x)
        bids = bids[self.column_order]
        bids = bids.values.tolist()
        self.bids_max_heap = list(map(tuple, bids))
        self.bids_max_heap.sort()
        heapq.heapify(self.bids_max_heap)

        asks = DataSplitter.get_side("sell", start_ob_state_df)
        asks = asks[self.column_order]
        asks = asks.values.tolist()
        self.asks_min_heap = list(map(tuple, asks))
        self.asks_min_heap.sort()
        heapq.heapify(self.asks_min_heap)

        # Keep track of order ids which should no longer be on the book
        self.invalid_order_ids = set()

        self.st = st

        self.start_seq = start_seq
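The sign flip applied to bid prices above is the standard way to get max-heap behaviour out of Python's min-heap-only heapq; a minimal standalone sketch of the trick, independent of the classes in this listing:

    import heapq

    # heapq is a min-heap, so store negated prices: popping the smallest
    # negated value yields the highest bid (max-heap via negation).
    bid_prices = [100.5, 101.0, 99.8]
    bids_max_heap = [-p for p in bid_prices]
    heapq.heapify(bids_max_heap)

    best_bid = -heapq.heappop(bids_max_heap)
    print(best_bid)  # 101.0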
Example No. 4
    def get_buy_sell_volume_ratio(df: dd):
        buys = DataSplitter.get_side("buy", df)
        sells = DataSplitter.get_side("sell", df)

        buy_vol = buys['size'].sum()
        sell_vol = sells['size'].sum()

        return Statistics.get_ratio(buy_vol, sell_vol)
Example No. 5
    def get_limit_market_order_ratio(df: dd):
        limits = DataSplitter.get_limit_orders_from_feed(df)
        markets = DataSplitter.get_market_orders_from_feed(df)

        num_limits = len(limits)
        num_markets = len(markets)

        return Statistics.get_ratio(num_limits, num_markets)
Example No. 6
    def load_split_data(real_root, start_time, end_time, product):
        feed_df = DataLoader().load_feed(real_root, start_time, end_time,
                                         product)
        feed_df = DataSplitter.get_product(product, feed_df)

        orders_df = DataSplitter.get_orders(feed_df)
        trades_df = DataSplitter.get_trades(feed_df)
        cancels_df = DataSplitter.get_cancellations(feed_df)

        return orders_df, trades_df, cancels_df
Example No. 7
    def test_sim_spread_plot(self):
        plt.figure(figsize=(12, 8))

        product = "LTC-USD"
        root = "/Users/jamesprince/project-data/results/sims/LTC-USD/2018-05-17/01:00:00/"

        st = datetime.datetime(2018, 5, 17, 1, 0, 0)
        et = datetime.datetime(2018, 5, 17, 1, 5, 0)

        all_sims = DataLoader().load_sim_data(root)
        # orders_dd, trades_dd, cancels_dd, midprices_dd, best_bids_dd, best_asks_dd

        orders_df = all_sims[0][0].compute()
        cancels_df = all_sims[0][2].compute()
        midprice_df = all_sims[0][3].compute()

        conf = configparser.ConfigParser()
        conf.read("../config/backtest.ini")
        config = BacktestConfig(conf)

        # limit_orders = DataSplitter.get_limit_orders(orders_df)
        # limit_orders['seconds'] = (limit_orders['time'] - limit_orders['time'].iloc[0]).apply(
        #     lambda x: x.total_seconds())
        #
        # buy_limit_orders = DataSplitter.get_side("buy", limit_orders)
        # sell_limit_orders = DataSplitter.get_side("sell", limit_orders)
        #
        # plt.plot(buy_limit_orders['seconds'], buy_limit_orders['price'], 'r+', label="Buy limit orders")
        # plt.plot(sell_limit_orders['seconds'], sell_limit_orders['price'], 'b+', label="Sell limit orders")

        cancels_df['seconds'] = (
            cancels_df['time'] -
            cancels_df['time'].iloc[0]).apply(lambda x: x.total_seconds())

        buy_cancels = DataSplitter.get_side("buy", cancels_df)
        sell_cancels = DataSplitter.get_side("sell", cancels_df)

        plt.plot(buy_cancels['seconds'],
                 buy_cancels['price'],
                 'r+',
                 label="Buy side cancels")
        plt.plot(sell_cancels['seconds'],
                 sell_cancels['price'],
                 'b+',
                 label="Sell side cancels")

        # plt.plot(res_df['seconds'], res_df['best_bid'], label='Best bid price')
        # plt.plot(res_df['seconds'], res_df['best_ask'], label='Best ask price')

        start_price = midprice_df['price'].iloc[0]
        plt.ylim(start_price - 5, start_price + 5)

        plt.legend()
        plt.show()
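The 'seconds' column above (and in the later plotting examples) converts timestamps into seconds elapsed since the first row; a small self-contained sketch of that conversion on synthetic timestamps (`.dt.total_seconds()` is an equivalent, slightly more idiomatic spelling):

    import pandas as pd

    times = pd.to_datetime(pd.Series(["2018-05-17 01:00:00",
                                      "2018-05-17 01:00:30",
                                      "2018-05-17 01:02:00"]))
    # seconds elapsed since the first timestamp
    seconds = (times - times.iloc[0]).apply(lambda x: x.total_seconds())
    print(seconds.tolist())  # [0.0, 30.0, 120.0]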
Example No. 8
    def compare_order_metrics(real_orders: pd.DataFrame, multi_sim_orders: List[pd.DataFrame]):
        """Compares metrics which only make sense in orders (e.g. buy/sell split)"""

        real_buy_orders = DataSplitter.get_side("buy", real_orders)
        sim_buy_orders = list(map(lambda sim: DataSplitter.get_side("buy", sim), multi_sim_orders))

        print("Buy metrics:")
        Evaluation.compare_metrics(real_buy_orders, sim_buy_orders)

        real_sell_orders = DataSplitter.get_side("sell", real_orders)
        sim_sell_orders = list(map(lambda sim: DataSplitter.get_side("sell", sim), multi_sim_orders))

        print("Sell metrics:")
        Evaluation.compare_metrics(real_sell_orders, sim_sell_orders)
Example No. 9
    def graph_relative_price_distribution(self, trades_df: dd, other_df: dd, num_bins=100):
        buy_orders = DataSplitter.get_side("buy", other_df)
        sell_orders = DataSplitter.get_side("sell", other_df)

        buy_prices = DataTransformer.get_relative_prices(trades_df, buy_orders)
        buy_prices = buy_prices.apply(lambda x: -x)
        sell_prices = DataTransformer.get_relative_prices(trades_df, sell_orders)

        # Graphing
        self.config.plt.figure(figsize=(12, 8))

        self.graph_distribution(buy_prices, self.data_description + ", Buy Side", "Price relative to most recent trade",
                                bins=num_bins)
        self.graph_distribution(sell_prices, self.data_description + ", Sell Side",
                                "Price relative to most recent trade", bins=num_bins)
Example No. 10
    def load_feed(cls,
                  root,
                  start_time: datetime,
                  end_time: datetime,
                  product: str,
                  fmt: str = "parquet") -> dd:
        """Loads in a feed of real data and applies formatting to timestamp, price and size columns"""
        # Assume data is on the same day and just hours apart for now
        hour_delta = end_time.hour - start_time.hour
        files_to_load = []

        # TODO: introduce wrapping over days
        # TODO: split this function up!
        # TODO: BUG: struggles to load small blobs of data
        for i in range(0, hour_delta + 1):
            filename = start_time.date().isoformat() + "/" + str(
                "%02i" % (start_time.hour + i)) + "." + fmt
            cls.logger.debug(filename)
            files_to_load.append(filename)

        feed_df = pd.DataFrame()
        for filename in files_to_load:
            file_path = root + filename
            if fmt == "parquet":
                file_df = pd.read_parquet(file_path)
            else:
                file_df = pd.read_csv(file_path)
            file_df = DataSplitter.get_product(product, file_df)
            file_df = DataLoader().format_dd(file_df)
            file_df = file_df[start_time < file_df['time']]
            file_df = file_df[file_df['time'] < end_time]
            feed_df = feed_df.append(file_df)

        return feed_df
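Given the hourly file layout implied by the loop above (<root>/<YYYY-MM-DD>/<HH>.parquet), a call mirrors the test code later in this listing; the root path and product id here are placeholders:

    import datetime

    st = datetime.datetime(2018, 5, 17, 1, 0, 0)
    et = datetime.datetime(2018, 5, 17, 3, 0, 0)
    # loads 01.parquet through 03.parquet and keeps rows with st < time < et
    feed_df = DataLoader.load_feed("/data/consolidated-feed/LTC-USD/", st, et, "LTC-USD")
    print(len(feed_df))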
Example No. 11
    def plot_orderbook(ob_state, xwindow, log_y_scale=False):
        import matplotlib.pyplot as plt
        plt.figure(figsize=(12, 8))
        bids = DataSplitter.get_side("buy", ob_state)
        asks = DataSplitter.get_side("sell", ob_state)

        OrderBookCreator.__plot_bid_side(bids, xwindow, percentile=0.9)
        OrderBookCreator.__plot_ask_side(asks, xwindow, percentile=0.9)

        plt.title("Order Book")
        plt.xlabel("Price")
        plt.ylabel("Cumulative size")
        if log_y_scale:
            plt.yscale('log')
        plt.legend()
        plt.show()
Example No. 12
    def test_compare_order_metrics(self):
        sim_root = (self.config.sim_root + self.sim_st.date().isoformat() + "/" +
                    self.sim_st.time().isoformat() + "/")
        all_sims = DataLoader().load_sim_data(sim_root)
        all_sim_limit_orders = list(
            map(lambda sim: DataSplitter.get_limit_orders(sim[0].compute()),
                all_sims))
        all_sim_market_orders = list(
            map(lambda sim: DataSplitter.get_market_orders(sim[0].compute()),
                all_sims))
        all_sim_trades = list(map(lambda sim: sim[1].compute(), all_sims))
        all_sim_cancels = list(map(lambda sim: sim[2].compute(), all_sims))

        feed_df = DataLoader().load_feed(
            self.config.real_root, self.sim_st,
            self.sim_st + timedelta(seconds=self.config.simulation_window),
            self.config.product)
        real_orders = DataSplitter.get_orders(feed_df)
        real_limit_orders = DataSplitter.get_limit_orders(real_orders)
        real_market_orders = DataSplitter.get_market_orders(real_orders)
        real_trades = DataSplitter.get_trades(feed_df)
        real_trades['size'] = pd.to_numeric(real_trades['remaining_size'])
        real_cancels = DataSplitter.get_cancellations(feed_df)
        real_cancels['size'] = pd.to_numeric(real_cancels['remaining_size'])

        print("Order Buy/Sell limit metrics")
        Evaluation.compare_order_metrics(real_limit_orders,
                                         all_sim_limit_orders)
        print("Order Buy/Sell market metrics")
        Evaluation.compare_order_metrics(real_market_orders,
                                         all_sim_market_orders)
        print("Cancel metrics")
        Evaluation.compare_order_metrics(real_cancels, all_sim_cancels)
        print("Trade metrics")
        Evaluation.compare_metrics(real_trades, all_sim_trades)
Example No. 13
    def __fetch_real_prices(self):
        df = DataLoader().load_feed(self.config.real_root, self.sim_st,
                                    self.sim_st + timedelta(seconds=self.config.simulation_window), self.config.product)
        trades_df = DataSplitter.get_trades(df)

        trades_df['time'] = DataUtils().get_times_in_seconds_after_start(trades_df['time'])
        # assign via .loc to avoid chained-indexing writes to a copy
        trades_df.loc[trades_df.index[0], 'price'] = DataUtils().get_first_non_nan(trades_df['price'])

        return trades_df[['time', 'price']]
Example No. 14
    def test_get_orders_per_minute(self):
        product = "LTC-USD"
        root = "/Users/jamesprince/project-data/data/consolidated-feed/"

        st = datetime.datetime(2018, 5, 17, 0, 0, 0)
        et = datetime.datetime(2018, 5, 17, 23, 59, 59)

        feed_df = DataLoader.load_feed(root + product + "/", st, et, product)

        orders = DataSplitter.get_orders(feed_df)
        limit_orders = DataSplitter.get_limit_orders(orders)

        print(
            str(len(limit_orders)) + " total limit orders per day for " +
            product)
        print(
            str(len(limit_orders) / (24 * 60)) +
            " limit orders per minute (on average) for " + product)
Example No. 15
    def graph_price_time(self, df: dd, data_desc: str, mid: int, ywindow: int):
        self.config.plt.figure(figsize=(12, 8))

        buy_df = DataSplitter.get_side("buy", df)
        sell_df = DataSplitter.get_side("sell", df)

        self.__graph_price_time_set(buy_df, 'r+')
        self.__graph_price_time_set(sell_df, 'b+')

        self.config.plt.xlabel('Time (s)')
        self.config.plt.ylabel('Price ($)')

        ymin, ymax = self.get_y_bounds(mid, ywindow)

        self.config.plt.ylim(ymin, ymax)
        self.config.plt.xlim(0, self.config.simulation_window)

        self.config.plt.title(self.data_description + " " + data_desc + ' price')

        return self.config.plt
Example No. 16
    def get_price_size_corr(trades_df: dd, limit_orders: dd):
        ret = {}

        for side in ["buy", "sell"]:
            side_df = DataSplitter.get_side(side, limit_orders)

            prices = DataTransformer.get_relative_prices(trades_df, side_df)
            sizes = side_df[side_df['size'].index.isin(prices.index)]['size']

            if side == "buy":
                prices = prices.apply(lambda x: -x)

            ret[side] = Correlations.get_correlation_matrix(prices, sizes)[0, 1]

        return ret
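The [0, 1] indexing above reads the off-diagonal entry of a 2x2 correlation matrix; assuming Correlations.get_correlation_matrix behaves like numpy.corrcoef (an assumption), a minimal illustration:

    import numpy as np

    prices = [0.01, 0.02, 0.05, 0.10]
    sizes = [2.0, 1.5, 1.0, 0.5]
    corr = np.corrcoef(prices, sizes)  # 2x2 correlation matrix
    print(corr[0, 1])                  # Pearson correlation between the inputs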
Example No. 17
def get_all_data(st: datetime, config):
    # Get all data which we will use to reconstruct the order book
    all_ob_start_time = st - datetime.timedelta(seconds=config.orderbook_window)
    all_ob_end_time = st
    all_ob_data = DataLoader().load_split_data(config.real_root, all_ob_start_time, all_ob_end_time, config.product)

    # Assume orderbook_window > sampling_window, and therefore filter already loaded ob data
    all_sample_start_time = st - datetime.timedelta(seconds=config.sampling_window)
    all_sample_end_time = st
    all_sampling_data = map(lambda x: DataSplitter.get_between(x, all_sample_start_time, all_sample_end_time),
                            all_ob_data)

    # Get future data
    all_future_data_start_time = st
    all_future_data_end_time = st + datetime.timedelta(seconds=config.sampling_window)
    all_future_data = DataLoader().load_split_data(config.real_root, all_future_data_start_time,
                                                   all_future_data_end_time, config.product)

    return all_ob_data, all_sampling_data, all_future_data
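A small sketch of the window arithmetic above, with illustrative (assumed) config values; because orderbook_window >= sampling_window, the sampling window is a sub-interval of the data already loaded for the order book and can simply be filtered from it:

    import datetime

    st = datetime.datetime(2018, 5, 17, 1, 0, 0)
    orderbook_window = 600  # seconds (assumed value)
    sampling_window = 300   # seconds (assumed value)

    ob_window = (st - datetime.timedelta(seconds=orderbook_window), st)
    sample_window = (st - datetime.timedelta(seconds=sampling_window), st)
    future_window = (st, st + datetime.timedelta(seconds=sampling_window))
    print(ob_window, sample_window, future_window)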
Example No. 18
    def get_order_stats(df: dd) -> Dict[Union[str, Any], Union[float, Any]]:
        stats = {
            'buy_order_ratio':
            Statistics.get_buy_sell_ratio(df)[0],
            'sell_order_ratio':
            Statistics.get_buy_sell_ratio(df)[1],
            'buy_volume_ratio':
            Statistics.get_buy_sell_volume_ratio(df)[0],
            'sell_volume_ratio':
            Statistics.get_buy_sell_volume_ratio(df)[1],
            'avg_order_size':
            Statistics.get_mean('size', df),
            'std_dev_order_size':
            Statistics.get_std_dev('size', df),
            'avg_sell_order_size':
            Statistics.get_mean('size', DataSplitter.get_side('sell', df)),
            'std_dev_sell_order_size':
            Statistics.get_std_dev('size', DataSplitter.get_side('sell', df)),
            'avg_buy_order_size':
            Statistics.get_mean('size', DataSplitter.get_side('buy', df)),
            'std_dev_buy_order_size':
            Statistics.get_std_dev('size', DataSplitter.get_side('buy', df)),
            'avg_price':
            df['price'].astype('float64').mean(),
            'std_dev_price':
            df['price'].astype('float64').std(),
            'avg_sell_order_price':
            Statistics.get_mean('price', DataSplitter.get_side('sell', df)),
            'std_dev_sell_price':
            Statistics.get_std_dev('price', DataSplitter.get_side('sell', df)),
            'avg_buy_price':
            Statistics.get_mean('price', DataSplitter.get_side('buy', df)),
            'std_dev_buy_order_price':
            Statistics.get_std_dev('price', DataSplitter.get_side('buy', df))
        }

        return stats
Example No. 19
    def get_lyapunov_exponent_over_time(trades, st, et, step_minutes,
                                        window_minutes):
        num_steps = ((et - st).total_seconds() / 60) / step_minutes
        lyap_exps = []
        times = []
        for i in range(0, int(num_steps)):
            iter_st = st + datetime.timedelta(minutes=step_minutes * i)
            iter_et = iter_st + datetime.timedelta(minutes=window_minutes)

            window = DataSplitter.get_between(trades, iter_st, iter_et)
            prices = np.asarray(window['price'].dropna(), dtype=np.float32)

            if len(prices) == 0:
                continue

            lyap_exp = nolds.lyap_r(prices)
            if lyap_exp > 0:
                lyap_exps.append(lyap_exp)
                times.append(iter_et)
            else:
                pass
        return times, lyap_exps
Example No. 20
    def get_hurst_exponent_over_time(trades, st, et, step_minutes,
                                     window_minutes):
        num_steps = ((et - st).total_seconds() / 60) / step_minutes
        hurst_exps = []
        times = []
        for i in range(0, int(num_steps)):
            iter_st = st + datetime.timedelta(minutes=step_minutes * i)
            iter_et = iter_st + datetime.timedelta(minutes=window_minutes)

            window = DataSplitter.get_between(trades, iter_st, iter_et)
            prices = np.asarray(window['price'].dropna(), dtype=np.float32)

            if len(prices) == 0:
                continue

            hurst_exp = nolds.hurst_rs(prices)
            # hurst_exp = nolds.dfa(prices) - 1
            print(hurst_exp)
            if 0 < hurst_exp < 1:
                hurst_exps.append(hurst_exp)
                times.append(iter_st)
            else:
                pass
        return times, hurst_exps
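The loop above keeps only Hurst estimates inside (0, 1); a quick sanity check of nolds.hurst_rs on synthetic data (assuming nolds and numpy are installed), where uncorrelated noise should come out near 0.5:

    import numpy as np
    import nolds

    rng = np.random.RandomState(0)
    noise = rng.standard_normal(2000)
    print(nolds.hurst_rs(noise))  # roughly 0.5 for uncorrelated data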
Example No. 21
    def test_orders_per_minute_windowed(self):
        product = "LTC-USD"
        root = "/Users/jamesprince/project-data/data/consolidated-feed/"

        st = datetime.datetime(2018, 5, 17, 0, 0, 0)
        et = datetime.datetime(2018, 5, 17, 23, 59, 59)

        feed_df = DataLoader.load_feed(root + product + "/", st, et, product)

        orders = DataSplitter.get_orders(feed_df)
        limit_orders = DataSplitter.get_limit_orders(orders)
        market_orders = DataSplitter.get_market_orders(orders)

        trades = DataSplitter.get_trades(feed_df)
        cancels = DataSplitter.get_cancellations(feed_df)

        print("Total limit orders: " + str(len(limit_orders)))
        print("Total market orders: " + str(len(market_orders)))
        print("Total trades: " + str(len(trades)))
        print("Total cancels: " + str(len(cancels)))

        # total_vol = trades['remaining_size'].sum()
        # print("Total traded volume: " + str(total_vol))

        window_minutes = 60
        step_minutes = 5

        times = []
        num_limit_orders = []
        num_market_orders = []
        num_trades = []
        num_cancels = []

        traded_vols = []

        for i in range(0, int((24 * 60) / step_minutes - 1)):
            window_st = st + datetime.timedelta(seconds=i * step_minutes * 60)
            window_et = window_st + datetime.timedelta(seconds=window_minutes *
                                                       60)

            limit_orders_this_window = DataSplitter.get_between(
                limit_orders, window_st, window_et)
            market_orders_this_window = DataSplitter.get_between(
                market_orders, window_st, window_et)
            trades_this_window = DataSplitter.get_between(
                trades, window_st, window_et)
            cancels_this_window = DataSplitter.get_between(
                cancels, window_st, window_et)

            times.append(window_st)
            num_limit_orders.append(len(limit_orders_this_window))
            num_market_orders.append(len(market_orders_this_window))
            num_trades.append(len(trades_this_window))
            num_cancels.append(len(cancels_this_window))

            # vol_this_window = trades_this_window['remaining_size'].sum()
            # traded_vols.append(vol_this_window)

        Statistics.plot_metric_daily_comparison(times, num_limit_orders,
                                                num_cancels, "LTC-USD", st,
                                                step_minutes, window_minutes,
                                                "Limit Orders", "Cancels")

        Statistics.plot_metric_daily(times, num_limit_orders, "LTC-USD", st,
                                     step_minutes, window_minutes,
                                     "Limit Orders")
        Statistics.plot_metric_daily(times, num_market_orders, "LTC-USD", st,
                                     step_minutes, window_minutes,
                                     "Market Orders")
        Statistics.plot_metric_daily(times, num_trades, "LTC-USD", st,
                                     step_minutes, window_minutes, "Trades")
        Statistics.plot_metric_daily(times, num_cancels, "LTC-USD", st,
                                     step_minutes, window_minutes, "Cancels")
        Statistics.plot_metric_daily(times, traded_vols, "LTC-USD", st,
                                     step_minutes, window_minutes,
                                     "Traded Volume")
Example No. 22
    def get_buy_sell_ratio(df: dd) -> Tuple[float, float]:
        num_buys = len(DataSplitter.get_side("buy", df))
        num_sells = len(DataSplitter.get_side("sell", df))

        return Statistics.get_ratio(num_buys, num_sells)
Example No. 23
    def check_ob_valid(ob: dd) -> bool:
        highest_buy = DataSplitter.get_side("buy", ob)['price'].max()
        lowest_sell = DataSplitter.get_side("sell", ob)['price'].min()

        return highest_buy < lowest_sell
Example No. 24
def backtest_mode(st: datetime.datetime = None):
    all_data_st = take_secs(st, max(config.orderbook_window, config.sampling_window))
    all_data_et = add_secs(st, config.num_predictions * config.interval)

    all_data = DataLoader.load_split_data(config.real_root, all_data_st, all_data_et, config.product)

    validate_future = None
    previous_backtest = None
    current_backtest = None
    sim_future = None
    sim_success = False
    sim_st = None

    for i in range(0, config.num_predictions):
        logger.info("Iteration " + str(i))
        sim_st = add_secs(st, config.interval * i)
        sim_et = add_secs(sim_st, config.simulation_window)

        ob_st = take_secs(sim_st, config.orderbook_window)
        ob_et = sim_st

        sam_st = take_secs(sim_st, config.sampling_window)
        sam_et = sim_st

        try:
            logger.info("Gathering data for simulation at: " + sim_st.isoformat())

            all_sampling_data = map(lambda x: DataSplitter.get_between(x, sam_st, sam_et),
                                    all_data)

            all_future_data = map(lambda x: DataSplitter.get_between(x, sim_st, sim_et),
                                  all_data)

            previous_backtest = current_backtest
            current_backtest = Backtest(config, sim_st, all_sampling_data, all_future_data)

        except Exception as e:
            logger.error("Error occurred when gathering data: " + str(e))
            current_backtest = None

        # Initiate simulation prep synchronously
        prep_success = current_backtest.prepare_simulation()

        # Wait for previous simulation to finish
        sim_future, sim_success = wait_on_simulation(sim_future, sim_st, sim_success)

        # Wait for previous validation to finish
        wait_on_validation(validate_future)
        # Set off validation for previous iteration
        validate_future = run_validation_async(previous_backtest, sim_success)

        # Run this current iteration's simulation async
        if current_backtest is not None and prep_success:
            sim_future = current_backtest.run_simulation()

    # Wait for previous validation to finish
    wait_on_validation(validate_future)

    sim_future, sim_success = wait_on_simulation(sim_future, sim_st, sim_success)

    if sim_success:
        logger.info("Starting final validation")
        current_backtest.evaluate_simulation(prog_start)
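The sync/async structure above pipelines the work: while iteration i's simulation runs, iteration i-1's results are validated. A rough sketch of that overlap pattern with concurrent.futures (illustrative only, not the real Backtest or validation API):

    import time
    from concurrent.futures import ThreadPoolExecutor

    def simulate(i):
        time.sleep(0.1)            # stand-in for a simulation run
        return "sim result %d" % i

    def validate(result):
        print("validating", result)

    with ThreadPoolExecutor(max_workers=2) as pool:
        sim_future = None
        for i in range(3):
            previous = sim_future.result() if sim_future is not None else None
            if previous is not None:
                pool.submit(validate, previous)    # validate previous run async
            sim_future = pool.submit(simulate, i)  # run current simulation async
        validate(sim_future.result())              # final validation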
Example No. 25
    def test_filter_when_cutoff_after_end(self):
        assert len(
            DataSplitter.get_first_n_nanos(self.df, 15 * 10**9)['time']) == 2
Example No. 26
    def graph_order_cancel_relative_price_distribution(self, feed_df):
        trades_df = DataSplitter.get_trades(feed_df)
        cancels_df = DataSplitter.get_cancellations(feed_df)
        self.graph_relative_price_distribution(trades_df, cancels_df)
Example No. 27
    def graph_sides(self, df: dd) -> None:
        btc_usd_price_buy = pd.Series(DataSplitter.get_side('buy', df)['price'].astype('float64').tolist())
        btc_usd_price_sell = pd.Series(DataSplitter.get_side('sell', df)['price'].astype('float64').tolist())

        self.graph_distribution(btc_usd_price_buy, self.data_description + ' buy side', 'Price ($)', bins=50)
        self.graph_distribution(btc_usd_price_sell, self.data_description + ' sell side', 'Price ($)', bins=50)
Example No. 28
    def generate_sim_params(cls,
                            orders_df,
                            trades_df,
                            cancels_df,
                            feed_df,
                            ob_state,
                            ob_state_seq_num,
                            ob_state_time,
                            graph=False):
        cls.check_has_elements([orders_df, trades_df, cancels_df])

        try:
            params = {}
            distributions = {}
            ratios = {}
            correlations = {}
            discrete_distributions = {}

            # TODO: reduce code duplication and parallelise inverse CDF generation
            with pebble.ProcessPool() as pool:
                price_size_corrs = Correlations.get_price_size_corr(
                    trades_df,
                    DataSplitter.get_limit_orders_from_feed(orders_df))
                correlations['buy_price_size'] = price_size_corrs['buy']
                correlations['sell_price_size'] = price_size_corrs['sell']

                # Sell order prices relative
                sell_orders = DataSplitter.get_side("sell", orders_df)
                sell_prices_relative = DataTransformer.get_prices_relative_to_midprice(
                    ob_state, ob_state_seq_num, ob_state_time, feed_df,
                    sell_orders)
                sell_x, sell_cy = Sample.get_cdf_data(sell_prices_relative)
                discrete_distributions["sell_price_relative"] = {
                    'x': sell_x.tolist(),
                    'cy': sell_cy.tolist()
                }
                Sample.plot_cdf(sell_x, sell_cy,
                                "Sell order prices (relative)")

                # Buy order prices relative
                buy_orders = DataSplitter.get_side("buy", orders_df)
                buy_prices_relative = DataTransformer.get_prices_relative_to_midprice(
                    ob_state, ob_state_seq_num, ob_state_time, feed_df,
                    buy_orders)
                buy_prices_relative = buy_prices_relative.apply(lambda x: -x)
                buy_x, buy_cy = Sample.get_cdf_data(buy_prices_relative)
                discrete_distributions["buy_price_relative"] = {
                    'x': buy_x.tolist(),
                    'cy': buy_cy.tolist()
                }
                Sample.plot_cdf(
                    buy_x, buy_cy,
                    "Buy prices (relative) (flipped for comparison)")

                # Buy side cancel prices relative
                buy_cancels = DataSplitter.get_side("buy", cancels_df)
                buy_cancels_relative = DataTransformer.get_prices_relative_to_midprice(
                    ob_state, ob_state_seq_num, ob_state_time, feed_df,
                    buy_cancels)
                buy_cancels_relative = buy_cancels_relative.apply(lambda x: -x)
                buy_cancels_x, buy_cancels_cy = Sample.get_cdf_data(
                    buy_cancels_relative)
                discrete_distributions["buy_cancels_relative"] = {
                    'x': buy_cancels_x.tolist(),
                    'cy': buy_cancels_cy.tolist()
                }
                Sample.plot_cdf(
                    buy_cancels_x, buy_cancels_cy,
                    "Buy cancel prices (relative) (flipped for comparison)")

                # Sell side cancel prices relative
                sell_cancels = DataSplitter.get_side("sell", cancels_df)
                sell_cancels_relative = DataTransformer.get_prices_relative_to_midprice(
                    ob_state, ob_state_seq_num, ob_state_time, feed_df,
                    sell_cancels)
                sell_cancels_x, sell_cancels_cy = Sample.get_cdf_data(
                    sell_cancels_relative)
                discrete_distributions["sell_cancels_relative"] = {
                    'x': sell_cancels_x.tolist(),
                    'cy': sell_cancels_cy.tolist()
                }
                Sample.plot_cdf(sell_cancels_x, sell_cancels_cy,
                                "Sell cancel prices (relative)")

                # Market orders
                market_orders = DataSplitter.get_market_orders_from_feed(
                    orders_df)

                # Buy market order sizes
                buy_market_sizes = DataSplitter.get_side(
                    "buy",
                    market_orders)['size'].dropna().apply(lambda x: abs(x))
                buy_market_sizes_x, buy_market_sizes_cy = Sample.get_cdf_data(
                    buy_market_sizes)
                discrete_distributions["buy_market_size"] = \
                    {'x': buy_market_sizes_x.tolist(), 'cy': buy_market_sizes_cy.tolist()}
                Sample.plot_cdf(buy_market_sizes_x, buy_market_sizes_cy,
                                "Buy market order sizes")

                # Sell market order sizes
                sell_market_sizes = DataSplitter.get_side(
                    "sell",
                    market_orders)['size'].dropna().apply(lambda x: abs(x))
                sell_market_sizes_x, sell_market_sizes_cy = Sample.get_cdf_data(
                    sell_market_sizes)
                discrete_distributions["sell_market_size"] = \
                    {'x': sell_market_sizes_x.tolist(), 'cy': sell_market_sizes_cy.tolist()}
                Sample.plot_cdf(sell_market_sizes_x, sell_market_sizes_cy,
                                "Sell market order sizes")

                # Find distributions using different procs
                # relative_order_price_distributions = pool.schedule(DataTransformer.price_distributions,
                #                                                    (trades_df, orders_df,),
                #                                                    dict(relative=True, graph=graph))

                # Buy/sell Price
                # order_price_distributions = pool.schedule(DataTransformer.price_distributions,
                #                                           (trades_df, orders_df,),
                #                                           dict(relative=False, graph=True))

                # Buy/sell price Cancellation
                # relative_cancel_price_distributions = pool.schedule(DataTransformer.price_distributions,
                #                                                     (trades_df, cancels_df,))

                # Limit Order Size
                limit_orders = DataSplitter.get_limit_orders_from_feed(
                    orders_df)

                buy_limit_orders_size = DataSplitter.get_side(
                    "buy",
                    limit_orders)['size'].dropna().apply(lambda x: abs(x))
                buy_limit_order_sizes_x, buy_limit_order_sizes_cy = Sample.get_cdf_data(
                    buy_limit_orders_size)
                discrete_distributions["buy_limit_size"] = \
                    {'x': buy_limit_order_sizes_x.tolist(), 'cy': buy_limit_order_sizes_cy.tolist()}
                Sample.plot_cdf(buy_limit_order_sizes_x,
                                buy_limit_order_sizes_cy,
                                "Buy limit order sizes")

                sell_limit_orders_size = DataSplitter.get_side(
                    "sell",
                    limit_orders)['size'].dropna().apply(lambda x: abs(x))
                sell_limit_order_sizes_x, sell_limit_order_sizes_cy = Sample.get_cdf_data(
                    sell_limit_orders_size)
                discrete_distributions["sell_limit_size"] = \
                    {'x': sell_limit_order_sizes_x.tolist(), 'cy': sell_limit_order_sizes_cy.tolist()}
                Sample.plot_cdf(sell_limit_order_sizes_x,
                                sell_limit_order_sizes_cy,
                                "Sell limit order sizes")

                intervals = DataTransformer.get_time_intervals(orders_df)
                intervals_x, intervals_cy = Sample.get_cdf_data(intervals)
                discrete_distributions["intervals"] = \
                    {'x': intervals_x.tolist(), 'cy': intervals_cy.tolist()}
                Sample.plot_cdf(intervals_x, intervals_cy, "Order intervals")

                # buy_limit_size = pool.schedule(DistributionFitter.best_fit_distribution,
                #                                (buy_limit_orders['size'],))
                # sell_limit_size = pool.schedule(DistributionFitter.best_fit_distribution,
                #                                 (sell_limit_orders['size'],))

                # Market Order Size

                # market_orders = DataSplitter.get_market_orders(orders_df)
                # buy_market_orders = DataSplitter.get_side("buy", market_orders)
                # sell_market_orders = DataSplitter.get_side("sell", market_orders)

                # buy_market_size = pool.schedule(DistributionFitter.best_fit_distribution,
                #                                (buy_market_orders['size'],))
                # sell_market_size = pool.schedule(DistributionFitter.best_fit_distribution,
                #                                 (sell_market_orders['size'],))

                # intervals = pool.schedule(DataTransformer.intervals_distribution, (orders_df,))

                ratios["buy_sell_order_ratio"] = Statistics.get_buy_sell_ratio(
                    orders_df)
                ratios[
                    "buy_sell_cancel_ratio"] = Statistics.get_buy_sell_ratio(
                        cancels_df)
                ratios[
                    "buy_sell_volume_ratio"] = Statistics.get_buy_sell_volume_ratio(
                        orders_df)
                ratios[
                    'limit_market_order_ratio'] = Statistics.get_limit_market_order_ratio(
                        orders_df)

                # Buy/sell Price relative
                # distributions["buy_price_relative"] = relative_order_price_distributions.result()["buy"][1]
                # distributions["sell_price_relative"] = relative_order_price_distributions.result()["sell"][1]

                # distributions["buy_price"] = order_price_distributions.result()["buy"][1]
                # distributions["sell_price"] = order_price_distributions.result()["sell"][1]

                # distributions["buy_cancel_price"] = relative_cancel_price_distributions.result()["buy"][1]
                # distributions["sell_cancel_price"] = relative_cancel_price_distributions.result()["sell"][1]

                # buy_limit_size_best_fit, buy_limit_size_best_fit_params = buy_limit_size.result()
                # _, distributions["buy_limit_size"] = DistributionFitter.get_distribution_string(buy_limit_size_best_fit,
                #                                                                                 buy_limit_size_best_fit_params)
                #
                # sell_limit_size_best_fit, sell_limit_size_best_fit_params = sell_limit_size.result()
                # _, distributions["sell_limit_size"] = DistributionFitter.get_distribution_string(sell_limit_size_best_fit,
                #                                                                                  sell_limit_size_best_fit_params)

                # buy_market_size_best_fit, buy_market_size_best_fit_params = buy_market_size.result()
                # _, distributions["buy_market_size"] = DistributionFitter.get_distribution_string(buy_market_size_best_fit,
                #                                                                                  buy_market_size_best_fit_params)
                #
                # sell_market_size_best_fit, sell_market_size_best_fit_params = sell_market_size.result()
                # _, distributions["sell_market_size"] = DistributionFitter.get_distribution_string(sell_market_size_best_fit,
                #                                                                                   sell_market_size_best_fit_params)

                # _, distributions["interval"] = intervals.result()

                params['ratios'] = ratios
                params['correlations'] = correlations
                params['distributions'] = distributions
                params['discreteDistributions'] = discrete_distributions

            return params
        except Exception as e:
            cls.logger.error("Failed to generate parameters, exception was " +
                             str(e))
            raise e
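The (x, cy) pairs built above describe empirical CDFs; a minimal helper in the same spirit (the exact behaviour of Sample.get_cdf_data is an assumption here):

    import numpy as np

    def empirical_cdf(samples):
        x = np.sort(np.asarray(samples, dtype=float))
        cy = np.arange(1, len(x) + 1) / len(x)  # cumulative probability at each x
        return x, cy

    x, cy = empirical_cdf([0.02, 0.01, 0.05, 0.01, 0.03])
    print(list(zip(x.tolist(), cy.tolist())))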
Example No. 29
    def test_real_spread_plot(self):
        plt.figure(figsize=(12, 8))

        product = "LTC-USD"
        root = "/Users/jamesprince/project-data/data/consolidated-feed/"

        st = datetime.datetime(2018, 5, 17, 1, 0, 0)
        et = datetime.datetime(2018, 5, 17, 1, 5, 0)

        feed_df = DataLoader.load_feed(root + product + "/", st, et, product)

        conf = configparser.ConfigParser()
        conf.read("../config/backtest.ini")
        config = BacktestConfig(conf)

        ob_seq, ob_state = reconstruct_orderbook(config, st,
                                                 logging.getLogger("test"))

        orderbook_evo = OrderBookEvolutor(ob_state, st, ob_seq)
        res_df = orderbook_evo.evolve_orderbook(feed_df)

        res_df['seconds'] = (
            res_df['time'] -
            res_df['time'].iloc[0]).apply(lambda x: x.total_seconds())

        print(res_df)

        limit_orders = DataSplitter.get_limit_orders_from_feed(feed_df)
        limit_orders['seconds'] = (
            limit_orders['time'] -
            limit_orders['time'].iloc[0]).apply(lambda x: x.total_seconds())

        buy_limit_orders = DataSplitter.get_side("buy", limit_orders)
        sell_limit_orders = DataSplitter.get_side("sell", limit_orders)

        cancels = DataSplitter.get_cancellations(feed_df)

        # print(cancels)

        cancels_merged = cancels.merge(limit_orders, on='order_id', how='left')

        # print(cancels_merged)

        cancels_merged['price'] = cancels_merged['price_x']
        cancels_merged['side'] = cancels_merged['side_x']
        cancels_merged['seconds'] = (cancels_merged['time_x'] -
                                     cancels_merged['time_x'].iloc[0]
                                     ).apply(lambda x: x.total_seconds())

        cancels_merged['lifetime'] = abs(cancels_merged['time_x'] -
                                         cancels_merged['time_y']).dropna()

        print(cancels_merged)
        median_idx = int(len(cancels_merged['lifetime']) / 2)
        print(cancels_merged['lifetime'].sort_values().iloc[median_idx])

        buy_cancels = DataSplitter.get_side("buy", cancels_merged)
        sell_cancels = DataSplitter.get_side("sell", cancels_merged)

        plt.plot(buy_limit_orders['seconds'],
                 buy_limit_orders['price'],
                 'r+',
                 label="Buy limit orders")
        plt.plot(sell_limit_orders['seconds'],
                 sell_limit_orders['price'],
                 'b+',
                 label="Sell limit orders")

        # plt.plot(buy_cancels['seconds'], buy_cancels['price'], 'r+', label="Buy side cancels")
        # plt.plot(sell_cancels['seconds'], sell_cancels['price'], 'b+', label="Sell side cancels")

        plt.plot(res_df['seconds'], res_df['best_bid'], label='Best bid price')
        plt.plot(res_df['seconds'], res_df['best_ask'], label='Best ask price')

        start_price = res_df['midprice'].iloc[0]
        plt.ylim(start_price - 5, start_price + 5)

        plt.legend()
        plt.show()
Example No. 30
    def test_filter_when_cutoff_before_end(self):

        assert len(DataSplitter.get_first_n_nanos(self.df, 5)['time']) == 1