Example no. 1
    def __init__(self, trade_order_list=None, bid_benchmark='mid', ask_benchmark='mid', benchmark_post_fix=''):
        super(BenchmarkArrival, self).__init__(trade_order_list=trade_order_list)

        self._bid_benchmark = bid_benchmark
        self._ask_benchmark = ask_benchmark
        self._time_series_ops = TimeSeriesOps()
        self._benchmark_name = 'arrival' + benchmark_post_fix
Example no. 2
    def _write_df_to_db_single_thread(self, ticker, remove_duplicates=True, if_exists_table='append',
                                      if_exists_ticker='replace'):

        logger = LoggerManager.getLogger(__name__)

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder, ticker + postfix) + '.' + fileformat

        logger.info("Reading " + filename)

        util_func = UtilFunc()
        time_series_ops = TimeSeriesOps()
        data_source_local = self._get_output_data_source()

        df = util_func.read_dataframe_from_binary(filename, format=binary_format)

        if df is not None:
            df = time_series_ops.localize_as_UTC(df)

            data_source_local.append_market_data(df, ticker, if_exists_table=if_exists_table,
                                                 if_exists_ticker=if_exists_ticker)
        else:
            logger.warn("Couldn't write dataframe for " + ticker + " to database, appears it is empty!")
Example no. 3
    def __init__(self, tables_dict={}, scalar=1, round_figures_by=None):
        self._tables_dict = tables_dict
        self._scalar = scalar
        self._round_figures_by = round_figures_by

        self._time_series_ops = TimeSeriesOps()
        self._util_func = UtilFunc()
Example no. 4
    def __init__(self,
                 temp_data_folder=constants.temp_data_folder,
                 temp_large_data_folder=constants.temp_large_data_folder,
                 tickers=None,
                 data_store=None):

        self.temp_data_folder = temp_data_folder
        self.temp_large_data_folder = temp_large_data_folder
        self.tickers = tickers
        self.util_func = UtilFunc()
        self.time_series_ops = TimeSeriesOps()
        self.data_store = data_store

        logger = LoggerManager().getLogger(__name__)

        if not (os.path.isdir(self.temp_data_folder)):
            logger.warn("Temp data folder " + self.temp_data_folder +
                        " does not exist")

        if not (os.path.isdir(self.temp_large_data_folder)):
            logger.warn("Temp large data folder " + self.temp_large_data_folder +
                        " does not exist")
Example no. 5
    def split_date_single_list(self, start_date, finish_date, split_size='yearly', add_partial_period_start_finish_dates=False):
        """From a start and finish date/time, create a range of dates at an annual, monthly or daily frequency. Also
        has the option of adding the start/finish dates, even if they are not strictly aligned to a frequency boundary.

        Parameters
        ----------
        start_date : Timestamp
            Start date/time

        finish_date : Timestamp
            Finish date/time

        split_size : str
            'yearly' - split into annual chunks
            'daily' - split into daily chunks
            'monthly' - split into monthly chunks

        add_partial_period_start_finish_dates : bool (default: False)
            Add the start and finish dates we originally specified even if they are not perfectly aligned to the periods

        Returns
        -------
        Timestamp (list)
        """
        from tcapy.util.timeseries import TimeSeriesOps

        start_date = TimeSeriesOps().date_parse(start_date)
        finish_date = TimeSeriesOps().date_parse(finish_date)

        # Map the named frequencies to pandas frequency aliases; otherwise assume
        # split_size is already a valid pandas frequency string
        freq_map = {'monthly': 'MS', 'daily': 'D', 'yearly': 'Y'}
        split_dates_freq = freq_map.get(split_size, split_size)

        dates = pd.date_range(start=start_date, end=finish_date, freq=split_dates_freq).tolist()

        if add_partial_period_start_finish_dates:

            if len(dates) > 0:
                if start_date < dates[0]:
                    dates = self.flatten_list_of_lists([start_date, dates])

                if finish_date > dates[-1]:
                    dates = self.flatten_list_of_lists([dates, finish_date])
            else:
                dates = [start_date, finish_date]

        return dates
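# A minimal usage sketch for the method above; it assumes (as the call to self.flatten_list_of_lists
# suggests) that the method lives on tcapy's UtilFunc, and that the import path below is correct:

from tcapy.util.utilfunc import UtilFunc

# Monthly ('MS') boundaries between the two dates, plus the unaligned start/finish dates themselves
dates = UtilFunc().split_date_single_list(
    '05 Jan 2017', '20 Mar 2017', split_size='monthly',
    add_partial_period_start_finish_dates=True)
# Expected: [05 Jan 2017, 01 Feb 2017, 01 Mar 2017, 20 Mar 2017]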
Example no. 6
    def __init__(self,
                 trade_order_list=None,
                 metric_name=None,
                 filter_by=['all'],
                 tag_value_combinations={},
                 keep_fields=['executed_notional', 'side'],
                 replace_text={},
                 round_figures_by=1,
                 scalar=1.0,
                 weighting_field=constants.table_weighting_field,
                 exclude_fields_from_avg=[]):
        self._trade_order_list = trade_order_list
        self._metric_name = metric_name
        self._results_summary = ResultsSummary()
        self._keep_fields = keep_fields
        self._filter_by = filter_by
        self._replace_text = replace_text
        self._round_figures_by = round_figures_by
        self._weighting_field = weighting_field
        self._scalar = scalar
        self._exclude_fields_from_avg = exclude_fields_from_avg

        self._tag_value_combinations = tag_value_combinations
        self._trade_order_filter_tag = TradeOrderFilterTag()
        self._results_form_tag = 'table'
        self._util_func = UtilFunc()
        self._time_series_ops = TimeSeriesOps()
Example no. 7
    def __init__(self,
                 market_data_postfix='dukascopy',
                 csv_market_data=None,
                 write_to_db=True,
                 sql_trade_database_type='ms_sql_server'):
        if csv_market_data is None:
            self._market_data_source = 'arctic-' + market_data_postfix
        else:
            self._market_data_source = csv_market_data

        self._tca_market = Mediator.get_tca_market_trade_loader()

        # Assumes Arctic/MongoDB for tick data, and MS SQL Server or MySQL for trade/order data
        if write_to_db:
            self._database_source_market = DatabaseSourceArctic(
                postfix=market_data_postfix)  # market data source

            self._market_data_database_name = constants.arctic_market_data_database_name
            self._market_data_database_table = constants.arctic_market_data_database_table

            if sql_trade_database_type == 'ms_sql_server':
                self._database_source_trade = DatabaseSourceMSSQLServer()  # trade data source
                self._trade_data_database_name = constants.ms_sql_server_trade_data_database_name
            elif sql_trade_database_type == 'mysql':
                self._database_source_trade = DatabaseSourceMySQL()  # trade data source
                self._trade_data_database_name = constants.mysql_trade_data_database_name
            else:
                raise Exception("Unsupported sql_trade_database_type: " + sql_trade_database_type)

        self.time_series_ops = TimeSeriesOps()
        self.rand_time_series = RandomiseTimeSeries()
Example no. 8
def test_remove_consecutive_duplicates():
    """Tests that consecutive duplicates are removed correctly in time series
    """
    dt = pd.date_range(start='01 Jan 2018', end='05 Jan 2018', freq='30s')

    df = pd.DataFrame(index=dt, columns=['bid', 'mid', 'ask'])

    df['mid'] = np.random.random(len(dt))
    df['bid'] = np.random.random(len(dt))
    df['ask'] = np.random.random(len(dt))

    # filter by 'mid'
    df2 = df.copy()

    df2.index = df2.index + timedelta(seconds=10)

    df_new = pd.concat([df, df2])
    df_new = df_new.sort_index()

    df_new = TimeSeriesOps().drop_consecutive_duplicates(df_new, 'mid')

    assert_frame_equal(df_new, df)

    # for 'bid' and 'ask'
    df2 = df.copy()

    df2.index = df2.index + timedelta(seconds=10)

    df_new = pd.concat([df, df2])
    df_new = df_new.sort_index()

    df_new = TimeSeriesOps().drop_consecutive_duplicates(
        df_new, ['bid', 'ask'])

    assert_frame_equal(df_new, df)
Example no. 9
class BenchmarkArrival(BenchmarkTrade):
    """For each trade dataframe, find the associated price associated with each trade arrival time in a market dataframe.
    Add as an 'arrival' column in the trade dataframe

    """
    def __init__(self,
                 trade_order_list=None,
                 bid_benchmark='mid',
                 ask_benchmark='mid',
                 benchmark_post_fix=''):
        super(BenchmarkArrival,
              self).__init__(trade_order_list=trade_order_list)

        self._bid_benchmark = bid_benchmark
        self._ask_benchmark = ask_benchmark
        self._time_series_ops = TimeSeriesOps()
        self._benchmark_name = 'arrival' + benchmark_post_fix

    def calculate_benchmark(self,
                            trade_order_df=None,
                            market_df=None,
                            trade_order_name=None,
                            bid_benchmark=None,
                            ask_benchmark=None):
        if not (self._check_calculate_benchmark(
                trade_order_name=trade_order_name)):
            return trade_order_df, market_df

        if bid_benchmark is None: bid_benchmark = self._bid_benchmark
        if ask_benchmark is None: ask_benchmark = self._ask_benchmark

        if bid_benchmark in market_df.columns and ask_benchmark in market_df.columns:
            trade_order_df[self._benchmark_name] = np.nan

            # Deal with all the buy trades (ie. buying at the ask!)
            is_side = trade_order_df['side'] == 1
            side_dt = trade_order_df.index[is_side]

            # TODO work on actual rather than copy
            benchmark, actual_dt = self._time_series_ops.vlookup_style_data_frame(
                side_dt, market_df, ask_benchmark)
            trade_order_df.loc[is_side, self._benchmark_name] = benchmark

            # Now, do all the sell trades (ie. selling at the bid!)
            is_side = trade_order_df['side'] == -1
            side_dt = trade_order_df.index[is_side]

            benchmark, actual_dt = self._time_series_ops.vlookup_style_data_frame(
                side_dt, market_df, bid_benchmark)
            trade_order_df.loc[is_side, self._benchmark_name] = benchmark

        return trade_order_df, market_df
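# vlookup_style_data_frame is essentially an as-of lookup: for each trade timestamp, take the most
# recent market observation at or before it. A self-contained sketch of the idea using
# pandas.merge_asof (illustrative only, not tcapy's implementation):

import numpy as np
import pandas as pd

market_df = pd.DataFrame({'mid': np.random.random(10)},
                         index=pd.date_range('01 Jan 2018', periods=10, freq='1min'))

# Trade times falling between market ticks
trade_times = pd.DataFrame({'time': market_df.index[[2, 5]] + pd.Timedelta(seconds=30)})

# direction='backward' picks the last market tick at or before each trade time
looked_up = pd.merge_asof(trade_times,
                          market_df.reset_index().rename(columns={'index': 'time'}),
                          on='time', direction='backward')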
Example no. 10
def test_time_delta():
    """Tests time delta function works for a number of different times"""
    td = TimeSeriesOps().get_time_delta("12:30")

    assert (td.seconds == 45000)

    td = TimeSeriesOps().get_time_delta("12:30:35")

    assert (td.seconds == 45035)

Example no. 11
    def __init__(self, trade_order_list=None, bid_benchmark='mid', ask_benchmark='mid', benchmark_post_fix='',
                 start_time_before_offset=None,
                 overwrite_time_of_day=None, overwrite_timezone=None):
        super(BenchmarkArrival, self).__init__(trade_order_list=trade_order_list)

        self._bid_benchmark = bid_benchmark
        self._ask_benchmark = ask_benchmark
        self._time_series_ops = TimeSeriesOps()
        self._benchmark_name = 'arrival' + benchmark_post_fix
        self._start_time_before_offset = start_time_before_offset
        self._overwrite_time_of_day = overwrite_time_of_day
        self._overwrite_timezone = overwrite_timezone
Example no. 12
def test_vlookup():
    """Runs a test for the VLOOKUP function which is used extensively in a lot of the metric construction
    """

    dt = pandas.date_range(start='01 Jan 2018', end='05 Jan 2018', freq='1min')

    rand_data = numpy.random.random(len(dt))

    df_before = pandas.DataFrame(index=dt, columns=['rand'], data=rand_data)

    millseconds_tests = [100, 500]

    # Try perturbing by 100 and then 500 milliseconds
    for millseconds in millseconds_tests:
        df_perturb = pandas.DataFrame(index=dt -
                                      timedelta(milliseconds=millseconds),
                                      columns=['rand'],
                                      data=rand_data)

        # do a VLOOKUP (which should give us all the previous ones) - take off the last point (which would be AFTER
        # our perturbation)
        search, dt_search = TimeSeriesOps().vlookup_style_data_frame(
            dt[0:-1], df_perturb, 'rand')

        df_after = pandas.DataFrame(index=dt_search +
                                    timedelta(milliseconds=millseconds),
                                    data=search.values,
                                    columns=['rand'])

        # check the search dataframes are equal
        assert_frame_equal(df_before[0:-1], df_after, check_dtype=False)

    # In this case, our lookup series doesn't overlap at all with our range, so we should get an exception back
    dt_lookup = pandas.date_range(start='30 Dec 2017',
                                  end='31 Dec 2018',
                                  freq='1min')

    df_perturb = pandas.DataFrame(index=dt +
                                  timedelta(milliseconds=millseconds_tests[-1]),
                                  columns=['rand'],
                                  data=rand_data)

    exception_has_been_triggered = False

    try:
        search, dt_search = TimeSeriesOps().vlookup_style_data_frame(
            dt_lookup, df_perturb, 'rand')
    except ValidationException:
        exception_has_been_triggered = True

    assert (exception_has_been_triggered)
Example no. 13
    def __init__(self, version=constants.tcapy_version, volatile_cache_engine=constants.volatile_cache_engine):
        self._data_factory = DataFactory(version=version)

        self._util_func = UtilFunc()  # general utility operations (such as flatten lists)
        self._fx_conv = FXConv()  # for determining if FX crosses are in the correct convention
        self._time_series_ops = TimeSeriesOps()  # time series operations, such as filtering by date

        self._metric_executed_price = MetricExecutedPriceNotional()  # for determining the executed notionals/price of orders
        # from trades

        self._benchmark_mid = BenchmarkMid()  # to calculate mid price from bid/ask quote market data
        self._trade_order_tag = TradeOrderFilterTag()  # to filter trade/orders according to the values of certain tags
        self._version = version
        self._volatile_cache_engine = volatile_cache_engine
Example no. 14
    def normalize_trade_data(self, df, dataset, data_request):

        if df is None: return None

        # For cancelled trades the trade price might be recorded as zero or a negative price, which is invalid;
        # make these NaNs
        if 'executed_price' in df.columns:
            df.loc[df['executed_price'] <= 0, 'executed_price'] = np.nan

        # Rename fields if necessary
        if 'executed_notional_currency' in df.columns:
            df = df.rename(
                columns={'executed_notional_currency': 'notional_currency'})

        # Convert buy/sell to +1/-1

        # TODO do regex/case insensitive version
        df['side'] = df['side'].replace(
            {'buy': 1, 'sell': -1, 'Buy': 1, 'Sell': -1, 'BUY': 1, 'SELL': -1})

        if 'event_type' in df.columns:
            df['event_type'] = df['event_type'].replace('execution', 'trade')

        # Also assume selected date columns are UTC (eg. benchmark start and finish dates for the orders)
        df = TimeSeriesOps().localize_cols_as_UTC(df,
                                                  constants.date_columns,
                                                  index=True).sort_index()

        df = self.offset_data_ms(df, data_request)

        return df
Example no. 15
    def get_time_series_ops():
        with Mediator._time_series_ops_lock:

            if Mediator._time_series_ops is None:
                Mediator._time_series_ops = TimeSeriesOps()

        return Mediator._time_series_ops
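# This is lock-guarded lazy initialisation of a shared instance (presumably a @staticmethod on
# Mediator, with _time_series_ops = None and _time_series_ops_lock defined at class level).
# A self-contained sketch of the same pattern, with illustrative names:

import threading

class _SharedRegistry(object):
    _instance = None
    _lock = threading.Lock()

    @staticmethod
    def get_instance():
        with _SharedRegistry._lock:
            # Only the first caller pays the construction cost; later calls reuse the instance
            if _SharedRegistry._instance is None:
                _SharedRegistry._instance = object()  # stand-in for TimeSeriesOps()

        return _SharedRegistry._instance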
Example no. 16
    def calculate_benchmark(self,
                            market_df=None,
                            time_of_day=None,
                            day_of_week=None,
                            month_of_year=None,
                            year=None,
                            specific_dates=None,
                            time_zone=None):

        if self._check_empty_benchmark_market_data(market_df): return market_df

        if time_of_day is None: time_of_day = self._time_of_day
        if day_of_week is None: day_of_week = self._day_of_week
        if month_of_year is None: month_of_year = self._month_of_year
        if year is None: year = self._year
        if specific_dates is None: specific_dates = self._specific_dates
        if time_zone is None: time_zone = self._time_zone

        market_df = TimeSeriesOps().filter_time_series_by_multiple_time_parameters(
            market_df,
            time_of_day=time_of_day,
            day_of_week=day_of_week,
            month_of_year=month_of_year,
            year=year,
            specific_dates=specific_dates,
            time_zone=time_zone)

        return market_df
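# The heavy lifting happens inside TimeSeriesOps; conceptually it is index-based masking on a
# DatetimeIndex. A self-contained sketch of time-of-day/day-of-week filtering in plain pandas
# (illustrative only, not tcapy's implementation):

import numpy as np
import pandas as pd

df = pd.DataFrame({'mid': np.random.random(1000)},
                  index=pd.date_range('01 Jan 2018', periods=1000, freq='30min', tz='utc'))

filtered = df.between_time('10:00', '16:00')        # keep a time-of-day window
filtered = filtered[filtered.index.dayofweek == 0]  # keep Mondays only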
Example no. 17
    def __init__(self, version=constants.tcapy_version):
        self._util_func = UtilFunc()

        self._tca_market_trade_loader = Mediator.get_tca_market_trade_loader(version=version)
        self._time_series_ops = TimeSeriesOps()
        self._trade_order_tag = TradeOrderFilterTag()

        logger = LoggerManager.getLogger(__name__)
        logger.info("Init TCAEngine version: " + self._tca_market_trade_loader.get_tca_version() + " - Env: " + constants.env)
Example no. 18
def test_overwrite_time_in_datetimeindex():
    """Tests that overwriting the time with a specific time of day works
    """
    # Clocks went forward in London at 01:00 on 29 Mar 2020
    datetimeindex = pd.date_range('28 Mar 2020', '05 Apr 2020', freq='h')
    datetimeindex = datetimeindex.tz_localize("utc")

    datetimeindex = TimeSeriesOps().overwrite_time_of_day_in_datetimeindex(
        datetimeindex, "16:00", overwrite_timezone="Europe/London")

    # Back in UTC time 16:00 LDN is 15:00 UTC after DST changes (and is 16:00 UTC beforehand)
    assert datetimeindex[0].hour == 16 and datetimeindex[-1].hour == 15
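# Conceptually, overwriting the time of day rebuilds each timestamp with a fixed wall-clock time
# in the target timezone, then converts back; a self-contained sketch (illustrative only, not
# tcapy's implementation):

import pandas as pd

idx = pd.date_range('28 Mar 2020', '05 Apr 2020', freq='h', tz='utc')

# Rebuild each date at 16:00 London wall-clock time, then express it back in UTC
ldn_dates = idx.tz_convert('Europe/London')
fixed = pd.DatetimeIndex([d.strftime('%Y-%m-%d') + ' 16:00' for d in ldn_dates])
fixed = fixed.tz_localize('Europe/London').tz_convert('utc')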
Example no. 19
    def set_trade_order_params(self, tca_request=None, time_of_day=None,
                               day_of_week=None, month_of_year=None, year=None, specific_dates=None,
                               time_zone='utc'):
        """Initialise our filter, by the times of day, days of the week and months we wish to filter our trade/filters by.
        Note that it is optional which period to filter by (eg. we can filter just by time of day if we want to).

        Parameters
        ----------
        tca_request : TCARequest
            TCA parameters for our analysis

        time_of_day : dict
            Describing the start and finish time of our filter

        day_of_week : str (list)
            Which day of the week to filter by?

        month_of_year : str (list)
            Which month of the year to filter by?

        year : int (list)
            Which year to filter by?

        specific_dates : str / str (list)
            Which dates to filter by

        time_zone : str
            Time zone to use (eg. 'utc')
        """

        self._tca_request = tca_request
        self._time_of_day = time_of_day
        self._day_of_week = day_of_week
        self._month_of_year = month_of_year
        self._year = year
        self._specific_dates = specific_dates
        self._time_zone = time_zone

        self._util_func = UtilFunc()
        self._time_series_ops = TimeSeriesOps()
Example no. 20
def test_append_market_data_arctic():
    """Tests we can append market data to arctic (we will have already written data to the test harness database)
    """
    if not (run_arctic_tests): return

    market_loader = Mediator.get_tca_market_trade_loader(version=tcapy_version)

    ### Test we can append (non-overlapping) data to Arctic
    arctic_start_date = '01 Jan 2016'
    arctic_finish_date = pd.Timestamp(datetime.datetime.utcnow())

    # use this market request later when reading back from Arctic
    market_request = MarketRequest(start_date=arctic_start_date, finish_date=arctic_finish_date, ticker=ticker,
                                   data_store=test_harness_arctic_market_data_store,
                                   market_data_database_table=test_harness_arctic_market_data_table)

    # load data from CSV for comparison later
    database_source_csv = DatabaseSourceCSV(market_data_database_csv=csv_market_data_store)

    market_df_csv = database_source_csv.fetch_market_data(
        start_date=arctic_start_date, finish_date=arctic_finish_date, ticker=ticker)

    market_df_list = TimeSeriesOps().split_array_chunks(market_df_csv, chunks=2)

    for a in arctic_lib_type:

        database_source = DatabaseSourceArctic(postfix='testharness', arctic_lib_type=a)

        market_df_lower = market_df_list[0]
        market_df_higher = market_df_list[1]

        database_source.append_market_data(market_df_lower, ticker, table_name=test_harness_arctic_market_data_table,
                                           if_exists_table='replace', if_exists_ticker='replace', remove_duplicates=False)

        overlap_error = False

        ## Try to append overlapping data (this will fail!)
        try:
            database_source.append_market_data(market_df_lower, ticker,
                                               table_name=test_harness_arctic_market_data_table,
                                               if_exists_table='append', if_exists_ticker='append', remove_duplicates=False)
        except ErrorWritingOverlapDataException as e:
            overlap_error = True

        assert overlap_error

        # Append non-overlapping data which follows (writing overlapping data into Arctic will mess up the datastore!)
        database_source.append_market_data(market_df_higher, ticker, table_name=test_harness_arctic_market_data_table,
                                           if_exists_table='append', if_exists_ticker='append', remove_duplicates=False)

        market_df_all_read_back = market_loader.get_market_data(market_request=market_request)

        assert all(market_df_all_read_back['mid'] - market_df_csv['mid'] < eps)
Example no. 21
def test_append_market_data_db():
    """Tests we can append market data to KDB/InfluxDB.
    """
    database_source_list, test_harness_market_data_table_list, test_harness_data_store_list = _get_db_database_source()

    market_loader = Mediator.get_tca_market_trade_loader(version=tcapy_version)

    for i in range(0, len(database_source_list)):

        database_source = database_source_list[i]
        test_harness_market_data_table = test_harness_market_data_table_list[i]
        test_harness_data_store = test_harness_data_store_list[i]

        ### Test we can append (non-overlapping) data to KDB/InfluxDB
        db_start_date = '01 Jan 2016'
        db_finish_date = pd.Timestamp(datetime.datetime.utcnow())

        # TODO
        market_request = MarketRequest(start_date=db_start_date, finish_date=db_finish_date, ticker=ticker,
                                       data_store=test_harness_data_store, market_data_database_table=test_harness_market_data_table)

        market_df_load = market_loader.get_market_data(market_request=market_request)

        market_df_list = TimeSeriesOps().split_array_chunks(market_df_load, chunks=2)

        market_df_lower = market_df_list[0]
        market_df_higher = market_df_list[1]

        database_source.append_market_data(market_df_lower, ticker, table_name=test_harness_market_data_table,
                                           if_exists_table='replace', if_exists_ticker='replace', remove_duplicates=False)

        overlap_error = False

        ## try to append overlapping data (this will fail!)
        try:
            database_source.append_market_data(market_df_lower, ticker,
                                               table_name=test_harness_market_data_table,
                                               if_exists_table='append', if_exists_ticker='append', remove_duplicates=False)
        except ErrorWritingOverlapDataException as e:
            overlap_error = True

        assert overlap_error

        # append non-overlapping data which follows (writing overlapping data can end up with duplicated values - although
        # KDB/InfluxDB will allow this)
        database_source.append_market_data(market_df_higher, ticker, table_name=test_harness_market_data_table,
                                           if_exists_table='append', if_exists_ticker='append', remove_duplicates=False)

        market_df_all_read_back = market_loader.get_market_data(market_request=market_request)

        assert all(market_df_all_read_back['mid'] - market_df_load['mid'] < eps)
Example no. 22
    def normalize_market_data(self, df, dataset, data_request):
        df = TimeSeriesOps().localize_as_UTC(df)

        # For each dataset we have a different field mapping (get the field mapping for that dataset from stored CSV files)

        # Convert vendor specific field names to the Cuemacro names

        # Convert vendor specific asset names (eg. GBP=) to Cuemacro standard names (GBPUSD)

        # The dataset is very dense; we assume it is stored on disk ordered (Arctic only allows ordered storage)
        # df = df.sort_index()

        df = self.offset_data_ms(df, data_request)

        return df
Example no. 23
def test_chunk():
    """Tests the chunking of dataframes works
    """
    dt = pd.date_range(start='01 Jan 2018', end='05 Jan 2018', freq='1min')

    df = pd.DataFrame(index=dt, columns=['bid', 'mid', 'ask'])

    df['mid'] = np.random.random(len(dt))

    df_chunk = TimeSeriesOps().split_array_chunks(df,
                                                  chunks=None,
                                                  chunk_size=100)
    df_chunk = pd.concat(df_chunk)

    assert_frame_equal(df_chunk, df)
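# The chunking above can be reproduced with plain positional slicing; a self-contained sketch
# (illustrative only, not tcapy's implementation):

import numpy as np
import pandas as pd

df = pd.DataFrame({'mid': np.random.random(1000)},
                  index=pd.date_range('01 Jan 2018', periods=1000, freq='1min'))

chunk_size = 100
chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

assert pd.concat(chunks).equals(df)  # re-concatenating recovers the original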
Example no. 24
def test_ohlc():
    """Tests the open/high/low/close resampling works on time series
    """
    dt = pd.date_range(start='01 Jan 2018', end='05 Jan 2018', freq='1s')

    df = pd.DataFrame(index=dt, columns=['bid', 'mid', 'ask'])

    df['mid'] = np.random.random(len(dt))

    df_ohlc = TimeSeriesOps().resample_time_series(df,
                                                   resample_amount=1,
                                                   how='ohlc',
                                                   unit='minutes',
                                                   field='mid')

    assert all(df_ohlc['high'] >= df_ohlc['low'])
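# The OHLC resampling above is conceptually pandas' own resample(...).ohlc(); a self-contained
# sketch of the equivalent check (illustrative only, not tcapy's implementation):

import numpy as np
import pandas as pd

dt = pd.date_range(start='01 Jan 2018', end='05 Jan 2018', freq='1s')
mid = pd.Series(np.random.random(len(dt)), index=dt)

df_ohlc_pandas = mid.resample('1min').ohlc()  # columns: open/high/low/close

assert (df_ohlc_pandas['high'] >= df_ohlc_pandas['low']).all()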
Example no. 25
    def __init__(self,
                 trade_order_list=None,
                 metric_name=None,
                 aggregate_by_field=None,
                 aggregation_metric='mean',
                 tag_value_combinations={}):
        self._trade_order_list = trade_order_list
        self._metric_name = metric_name
        self._aggregate_by_field = aggregate_by_field
        self._aggregation_metric = aggregation_metric
        self._results_summary = ResultsSummary()

        self._tag_value_combinations = tag_value_combinations
        self._trade_order_filter_tag = TradeOrderFilterTag()
        self._util_func = UtilFunc()
        self._time_series_ops = TimeSeriesOps()
Example no. 26
def test_data_frame_holder():
    """Tests the storing of DataFrameHolder object which is like an enhanced dict specifically for storing DataFrames,
    alongside using the VolatileCache
    """
    from tcapy.analysis.dataframeholder import DataFrameHolder
    from tcapy.data.volatilecache import VolatileRedis as VolatileCache
    volatile_cache = VolatileCache()

    # Create a very large DataFrame, which needs to be chunked in storage
    dt = pd.date_range(start='01 Jan 2000', end='05 Jan 2020', freq='10s')
    df = pd.DataFrame(index=dt, columns=['bid', 'mid', 'ask'])

    df['bid'] = np.ones(len(dt))
    df['mid'] = np.ones(len(dt))
    df['ask'] = np.ones(len(dt))

    df_list = TimeSeriesOps().split_array_chunks(df, chunks=2)
    df_lower = df_list[0]
    df_higher = df_list[1]

    for i in ['_comp', '']:
        df_holder = DataFrameHolder()

        df_holder.add_dataframe(
            volatile_cache.put_dataframe_handle(df_lower,
                                                use_cache_handles=True),
            'EURUSD_df' + i)
        df_holder.add_dataframe(
            volatile_cache.put_dataframe_handle(df_higher,
                                                use_cache_handles=True),
            'EURUSD_df' + i)

        df_dict = df_holder.get_combined_dataframe_dict()

        df_final = df_dict['EURUSD_df' + i]

        assert_frame_equal(df, df_final)
Example no. 27
def test_write_multiple_wildcard_market_data_csvs_arctic():
    """Tests we can write sequential market data CSVs (or HDF5) whose path has been specified by a wildcard (eg. EURUSD*.csv).
    It is assumed that the CSVs are in chronological orders, from their filenames.
    """
    if not (run_arctic_tests): return

    market_loader = Mediator.get_tca_market_trade_loader(version=tcapy_version)

    arctic_start_date = '01 Jan 2016'
    arctic_finish_date = pd.Timestamp(datetime.datetime.utcnow())

    for a in arctic_lib_type:
        database_source = DatabaseSourceArctic(postfix='testharness',
                                               arctic_lib_type=a)

        ### Read CSV data which is sorted ascending (default!)
        database_source.convert_csv_to_table(
            csv_market_data_store,
            ticker,
            test_harness_arctic_market_data_table,
            if_exists_table='replace',
            if_exists_ticker='replace',
            market_trade_data='market',
            csv_read_chunksize=10**6,
            remove_duplicates=False)

        database_source_csv = DatabaseSourceCSV(
            market_data_database_csv=csv_market_data_store)

        market_df_csv = database_source_csv.fetch_market_data(
            start_date=arctic_start_date,
            finish_date=arctic_finish_date,
            ticker=ticker)

        # Prepare the CSV folder first
        csv_folder = os.path.join(constants.test_data_harness_folder,
                                  'csv_arctic_mult')

        # Empty the CSV test harness folder, where we shall dump the mini CSVs
        UtilFunc().forcibly_create_empty_folder(csv_folder)

        # Split the CSV file into several mini CSV files (and also HDF5 files)
        market_df_list = TimeSeriesOps().split_array_chunks(market_df_csv,
                                                            chunks=3)

        chunk_no = 0

        for m in market_df_list:
            m.to_csv(
                os.path.join(csv_folder, "EURUSD" + str(chunk_no) + '.csv'))
            UtilFunc().write_dataframe_to_binary(
                m,
                os.path.join(csv_folder,
                             "EURUSD" + str(chunk_no) + '.parquet'),
                format='parquet')

            chunk_no = chunk_no + 1

        file_ext = ['csv', 'parquet']

        for f in file_ext:
            ### Read CSV data from the mini CSVs (using wildcard char) and dump to Arctic
            database_source.convert_csv_to_table(
                os.path.join(csv_folder, "EURUSD*." + f),
                ticker,
                test_harness_arctic_market_data_table,
                if_exists_table='append',
                if_exists_ticker='replace',
                market_trade_data='market',
                csv_read_chunksize=10**6,
                remove_duplicates=False)

            market_request = MarketRequest(
                start_date=arctic_start_date,
                finish_date=arctic_finish_date,
                ticker=ticker,
                data_store=database_source,
                market_data_database_table=test_harness_arctic_market_data_table
            )

            # Read back from Arctic
            market_df_load = market_loader.get_market_data(
                market_request=market_request)

            # Compare reading directly from the original large CSV vs. reading back from arctic (which was dumped from split CSVs)
            diff_df = abs(market_df_load['mid'] - market_df_csv['mid'])

            outside_bounds = diff_df[diff_df >= eps]

            assert len(outside_bounds) == 0
Example no. 28
class BenchmarkArrival(BenchmarkTrade):
    """For each trade/order DataFrame, finds the associated price associated with each trade arrival time in a market dataframe.
    Adds as an 'arrival' column in the trade/order DataFrame

    """

    def __init__(self, trade_order_list=None, bid_benchmark='mid', ask_benchmark='mid', benchmark_post_fix='',
                 start_time_before_offset=None,
                 overwrite_time_of_day=None, overwrite_timezone=None):
        super(BenchmarkArrival, self).__init__(trade_order_list=trade_order_list)

        self._bid_benchmark = bid_benchmark
        self._ask_benchmark = ask_benchmark
        self._time_series_ops = TimeSeriesOps()
        self._benchmark_name = 'arrival' + benchmark_post_fix
        self._start_time_before_offset = start_time_before_offset
        self._overwrite_time_of_day = overwrite_time_of_day
        self._overwrite_timezone = overwrite_timezone

    def calculate_benchmark(self, trade_order_df=None, market_df=None, trade_order_name=None, bid_benchmark=None,
                            ask_benchmark=None, start_time_before_offset=None, overwrite_time_of_day=None, overtime_zone=None):
        if self._check_empty_benchmark_market_trade_data(trade_order_name, trade_order_df, market_df):
            return trade_order_df, market_df

        if bid_benchmark is None: bid_benchmark = self._bid_benchmark
        if ask_benchmark is None: ask_benchmark = self._ask_benchmark
        if start_time_before_offset is None: start_time_before_offset = self._start_time_before_offset
        if overwrite_time_of_day is None: overwrite_time_of_day = self._overwrite_time_of_day
        if overtime_zone is None: overtime_zone = self._overwrite_timezone

        if bid_benchmark in market_df.columns and ask_benchmark in market_df.columns:
            trade_order_df[self._benchmark_name] = np.nan

            # Deal with all the buy trades (ie. buying at the ask!)
            is_side = trade_order_df['side'] == 1
            side_dt = trade_order_df.index[is_side]

            if start_time_before_offset is not None:
                side_dt = side_dt - self._time_series_ops.get_time_delta(start_time_before_offset)

            if overwrite_time_of_day is not None:
                side_dt = self._time_series_ops.overwrite_time_of_day_in_datetimeindex(side_dt, overwrite_time_of_day,
                            old_tz=trade_order_df.index.tz, overwrite_timezone=overtime_zone)

            # TODO work on actual rather than copy
            benchmark, actual_dt = self._time_series_ops.vlookup_style_data_frame(side_dt, market_df, ask_benchmark)
            trade_order_df.loc[is_side, self._benchmark_name] = benchmark

            # Now, do all the sell trades (ie. selling at the bid!)
            is_side = trade_order_df['side'] == -1
            side_dt = trade_order_df.index[is_side]

            # Offset time and then overwrite if specified by user
            if start_time_before_offset is not None:
                side_dt = side_dt - self._time_series_ops.get_time_delta(start_time_before_offset)

            if overwrite_time_of_day is not None:
                side_dt = self._time_series_ops.overwrite_time_of_day_in_datetimeindex(side_dt, overwrite_time_of_day,
                            old_tz=trade_order_df.index.tz, overwrite_timezone=overtime_zone)

            benchmark, actual_dt = self._time_series_ops.vlookup_style_data_frame(side_dt, market_df, bid_benchmark)
            trade_order_df.loc[is_side, self._benchmark_name] = benchmark

        return trade_order_df, market_df
Example no. 29
    def _combine_mini_df_from_disk_single_thread(self,
                                                 ticker,
                                                 remove_duplicates=True):

        logger = LoggerManager.getLogger(__name__)
        time_series_ops = TimeSeriesOps()

        logger.info('Getting ' + ticker + ' filenames...')
        temp_data_folder = self.temp_data_folder

        filename_list = []

        for root, dirnames, filenames in os.walk(temp_data_folder):

            for filename in filenames:
                if ticker in filename and '.' + fileformat in filename:
                    filename_h5_parquet = os.path.join(root, filename)

                    # If the file is smaller than 10MB, add it (otherwise it is likely a very large aggregated file!)
                    if os.path.getsize(filename_h5_parquet) < 10 * 1024 * 1024:
                        filename_list.append(filename_h5_parquet)

        df_list = []

        util_func = UtilFunc()

        logger.info('Loading ' + ticker + ' mini dataframes into memory')

        i = 0

        if len(filename_list) == 0:
            logger.warn("Looks like there are no files for " + ticker +
                        " in " + temp_data_folder +
                        ". Are you sure path is correct?")

        # Go through each mini file which represents a few minutes of data and append it
        for filename in filename_list:
            filesize = 0

            try:
                filesize = os.path.getsize(filename) / 1024.0
                df = util_func.read_dataframe_from_binary(filename,
                                                          format=binary_format)

                i = i + 1

                # Every 100 files, print reading progress
                if i % 100 == 0:
                    logger.info('Reading ' + filename + ' number ' + str(i))

                if df is not None:
                    df = df.sort_index()
                    df = self._remove_duplicates_time_series(df,
                                                             remove_duplicates,
                                                             time_series_ops,
                                                             field='mid')

                    df_list.append(df)
            except Exception as e:
                logger.warn('Failed to parse ' + filename + " of " +
                            str(filesize) + "KB")  # + str(e))

        # Assume UTC time (don't want to mix UTC and non-UTC in database!)
        if df_list == []:
            logger.warn('No dataframe read for ' + ticker +
                        ', cannot combine!')

            return

        logger.info('About to combine ' + ticker +
                    ' into large dataframe to write to disk...')

        df = pd.concat(df_list)
        df = time_series_ops.localize_as_UTC(df)

        df = df.sort_index()

        df = self._remove_duplicates_time_series(df,
                                                 remove_duplicates,
                                                 time_series_ops,
                                                 field='mid')

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder,
                                ticker + postfix) + '.' + fileformat

        util_func.write_dataframe_to_binary(df, filename, format=binary_format)
Example no. 30
class TCATickerLoader(ABC):
    """This class is designed to load up market and trade data for single _tickers and also makes appropriate metric calculations
    for that specific ticker. It is generally called by the higher level TCAMarketTradeLoader class, which can handle multiple _tickers.

    """
    def __init__(self,
                 version=constants.tcapy_version,
                 volatile_cache_engine=constants.volatile_cache_engine):
        self._data_factory = DataFactory(version=version)

        self._util_func = UtilFunc()  # general utility operations (such as flatten lists)
        self._fx_conv = FXConv()  # for determining if FX crosses are in the correct convention
        self._time_series_ops = TimeSeriesOps()  # time series operations, such as filtering by date

        self._metric_executed_price = MetricExecutedPriceNotional()  # for determining the executed
        # notionals/price of orders from trades

        self._benchmark_mid = BenchmarkMarketMid()  # to calculate mid price from bid/ask quote market data
        self._trade_order_tag = TradeOrderFilterTag()  # to filter trade/orders according to the values of certain tags
        self._version = version
        self._volatile_cache_engine = volatile_cache_engine

    def get_market_data(self, market_request):
        """Gets market data for a particular ticker. When we ask for non-standard FX crosses, only the mid-field is
        returned (calculated as a cross rate). We do not give bid/ask quotes for calculated non-standard _tickers, as these
        can difficult to estimate.

        Parameters
        ----------
        market_request : MarketRequest
            The type of market data to get

        Returns
        -------
        DataFrame
        """
        logger = LoggerManager.getLogger(__name__)

        if isinstance(market_request, TCARequest):
            market_request = MarketRequest(market_request=market_request)

        old_ticker = market_request.ticker

        if market_request.asset_class == 'fx':
            # Check if we can get ticker directly or need to create synthetic cross rates
            ticker = self._fx_conv.correct_notation(market_request.ticker)
        else:
            # If not FX we don't have to invert
            ticker = old_ticker

        # Check whether the ticker in the correct convention is among the crosses for which we collect data
        # (typically the USD crosses, plus some liquid non-USD pairs like EURJPY)

        if isinstance(market_request.data_store, DatabaseSource):
            # TODO improve ticker check here!
            available_tickers = [ticker]
        elif 'csv' in market_request.data_store or 'h5' in market_request.data_store or 'gzip' in market_request.data_store \
            or 'parquet' in market_request.data_store or isinstance(market_request.data_store, pd.DataFrame) :

            # For CSV (or H5) we don't have much choice, and could differ between CSV files (if CSV has 'ticker' field, will
            # match on that)
            available_tickers = [ticker]
        elif market_request.data_store in constants.market_data_tickers:
            available_tickers = self._util_func.dict_key_list(
                constants.market_data_tickers[market_request.data_store].keys())
        else:
            err_msg = 'Ticker ' + str(ticker) + " doesn't seem available in the data source " + \
                      str(market_request.data_store)

            logger.error(err_msg)

            raise Exception(err_msg)

        if ticker in available_tickers:

            # In the correct convention or is not FX
            if ticker == old_ticker:
                market_df = self._get_correct_convention_market_data(
                    market_request)

            # Otherwise need to flip to the correct convention (only will return 'mid')
            else:
                market_request_flipped = MarketRequest(
                    market_request=market_request)
                market_request_flipped.ticker = ticker

                market_df = self._invert_quoting_market(
                    self._get_correct_convention_market_data(
                        market_request_flipped))

                if 'ticker' in market_df.columns:
                    market_df['ticker'] = old_ticker
        else:
            if market_request.asset_class == 'fx' and market_request.instrument == 'spot':
                # Otherwise we need to get both legs
                # eg. for NZDCAD, we shall download NZDUSD and USDCAD => multiply them to get NZDCAD
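                # ie. on the aligned, forward-filled index: NZDCAD_mid[t] = NZDUSD_mid[t] * USDCAD_mid[t]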

                # get the USD crosses for each leg and then multiply
                market_request_base = MarketRequest(
                    market_request=market_request)
                market_request_terms = MarketRequest(
                    market_request=market_request)

                market_request_base.ticker = old_ticker[0:3] + 'USD'
                market_request_terms.ticker = 'USD' + old_ticker[3:7]

                tickers_exist = self._fx_conv.currency_pair_in_list(
                        self._fx_conv.correct_notation(market_request_base.ticker), available_tickers) and \
                        self._fx_conv.currency_pair_in_list(
                            self._fx_conv.correct_notation(market_request_terms.ticker), available_tickers)

                # If both USD tickers don't exist, try computing via EUR tickers (eg. USDSEK from EURUSD & EURSEK)
                if not (tickers_exist):
                    market_request_base.ticker = old_ticker[0:3] + 'EUR'
                    market_request_terms.ticker = 'EUR' + old_ticker[3:7]

                    tickers_exist = self._fx_conv.currency_pair_in_list(
                        self._fx_conv.correct_notation(market_request_base.ticker), available_tickers) and \
                                    self._fx_conv.currency_pair_in_list(
                                        self._fx_conv.correct_notation(market_request_terms.ticker), available_tickers)

                # Check if that currency (in the CORRECT convention) is in the available tickers;
                # we will typically not collect market data for currencies in their wrong convention
                if tickers_exist:

                    fields_try = ['bid', 'ask', 'mid']

                    market_base_df = self.get_market_data(market_request_base)
                    market_terms_df = self.get_market_data(
                        market_request_terms)

                    market_has_data = False

                    if market_base_df is not None and market_terms_df is not None:
                        if not (market_base_df.empty) and not (
                                market_terms_df.empty):
                            market_has_data = True

                    # If there's no data in either DataFrame, don't attempt to calculate anything
                    if not (market_has_data):
                        return pd.DataFrame()

                    fields = []

                    for f in fields_try:
                        if f in market_base_df.columns and f in market_terms_df.columns:
                            fields.append(f)

                    # Only attempt to calculate if the fields exist
                    if len(fields) > 0:
                        # Remove any other columns (eg. with ticker name etc.)
                        market_base_df = market_base_df[fields]
                        market_terms_df = market_terms_df[fields]

                        # Need to align series to multiply (and then fill down points which don't match)
                        # can't use interpolation, given that would use FUTURE data
                        market_base_df, market_terms_df = market_base_df.align(
                            market_terms_df, join="outer")
                        market_base_df = market_base_df.ffill()
                        market_terms_df = market_terms_df.ffill()

                        market_df = pd.DataFrame(data=market_base_df.values *
                                                 market_terms_df.values,
                                                 columns=fields,
                                                 index=market_base_df.index)

                        # Values at the start of the series MIGHT be nan, so need to ignore those
                        market_df = market_df.dropna(subset=['mid'])

                        if 'ticker' in market_df.columns:
                            market_df['ticker'] = old_ticker
                    else:
                        return None

                else:
                    # Otherwise couldn't compute either from the USD legs or EUR legs
                    logger.warning("Couldn't find market data for ticker: " +
                                   str(ticker))

                    return None
            else:
                # Otherwise couldn't find the non-FX ticker
                logger.warning("Couldn't find market data for ticker: " +
                               str(ticker))

                return None

        return market_df

    def get_trade_order_data(self,
                             tca_request,
                             trade_order_type,
                             start_date=None,
                             finish_date=None):
        """Gets trade data for specified parameters (eg. start/finish dates _tickers). Will also try to find trades
        when they have booked in the inverted market convention, and change the fields appropriately. For example, if
        we ask for GBPUSD trade data, it will also search for USDGBP and convert those trades in the correct convention.

        Parameters
        ----------
        tca_request : TCARequest
            What type of trade data do we want

        trade_order_type : str
            Do we want trade or order data?

        Returns
        -------
        DataFrame
        """
        logger = LoggerManager().getLogger(__name__)

        # by default, assume we want trade data (rather than order data)
        if trade_order_type is None:
            trade_order_type = 'trade_df'

        if start_date is None and finish_date is None:
            start_date = tca_request.start_date
            finish_date = tca_request.finish_date

        # Create request for actual executed trades
        trade_request = TradeRequest(trade_request=tca_request)

        trade_request.start_date = start_date
        trade_request.finish_date = finish_date
        trade_request.trade_order_type = trade_order_type

        # Fetch all the trades done in that ticker (will be sparse-like randomly spaced tick data)
        # assumed to be the correct convention (eg. GBPUSD)
        trade_df = self._data_factory.fetch_table(data_request=trade_request)

        # If FX, see whether trades were also booked in the inverted convention
        if tca_request.asset_class == 'fx' and tca_request.instrument == 'spot':
            # Also fetch data in the inverted cross (eg. USDGBP) as some trades may be recorded this way
            inv_trade_request = TradeRequest(trade_request=tca_request)

            inv_trade_request.start_date = start_date
            inv_trade_request.finish_date = finish_date
            inv_trade_request.trade_order_type = trade_order_type

            inv_trade_request.ticker = self._fx_conv.reverse_notation(
                trade_request.ticker)

            trade_inverted_df = self._data_factory.fetch_table(
                data_request=inv_trade_request)

            # Only add inverted trades if they exist!
            if trade_inverted_df is not None:
                if not (trade_inverted_df.empty):

                    invert_price_columns = [
                        'executed_price', 'price_limit', 'market_bid',
                        'market_mid', 'market_ask', 'arrival_price'
                    ]
                    invert_price_columns = [
                        x for x in invert_price_columns
                        if x in trade_inverted_df.columns
                    ]

                    # For trades (but not orders), there is an executed price field, which needs to be inverted
                    if invert_price_columns != []:
                        trade_inverted_df[
                            invert_price_columns] = 1.0 / trade_inverted_df[
                                invert_price_columns].values

                    trade_inverted_df['side'] = -trade_inverted_df[
                        'side']  # buys become sells, and vice versa!
                    trade_inverted_df['ticker'] = trade_request.ticker

                    if trade_df is not None:
                        trade_df = pd.concat([trade_df, trade_inverted_df])
                        trade_df = trade_df.sort_index()
                    else:
                        trade_df = trade_inverted_df

        # Check if the trade data is empty; if it is, return None
        if self._check_is_empty_trade_order(trade_df, tca_request, start_date,
                                            finish_date, trade_order_type):
            return None

        if tca_request.asset_class == 'fx' and tca_request.instrument == 'spot':

            # Check if any notionals of any trade/order are quoted in the TERMS currency?
            terms_notionals = trade_df[
                'notional_currency'] == tca_request.ticker[3:6]

            # If any notional are quoted as terms, we should invert these so we quote notionals with base currency
            # for consistency
            if terms_notionals.any():
                inversion_ticker = tca_request.ticker[
                    3:6] + tca_request.ticker[0:3]

                inversion_spot, trade_df = self._fill_reporting_spot(
                    inversion_ticker, trade_df, start_date, finish_date,
                    tca_request)

                notional_fields = [
                    'notional', 'order_notional', 'executed_notional'
                ]

                # Need to check terms notionals again, as trade data could have shrunk (because can only get trades, where we have market data)
                terms_notionals = trade_df['notional_currency'] == str(
                    tca_request.ticker[3:6])

                # Only get the inversion spot if any terms notionals are quoted wrong way around
                if terms_notionals.any():
                    if inversion_spot is not None:
                        for n in notional_fields:
                            if n in trade_df.columns:
                                trade_df.loc[terms_notionals, n] = \
                                    trade_df[n][terms_notionals].values * \
                                    inversion_spot[terms_notionals].values
                    else:
                        logger.warning(
                            "Couldn't get spot data for " + inversion_ticker +
                            " to invert notionals. Hence not returning trading data."
                        )

                if terms_notionals.any():
                    trade_df.loc[terms_notionals, 'notional_currency'] = trade_request.ticker[0:3]

            # Also represent notional is reporting currency notional amount (eg. if we are USD based investors, convert
            # notional to USDs)

            # Using a reporting currency can be particularly useful if we are trying to aggregate metrics from many different
            # currency pairs (and wish to weight by a commonly measured reporting notional)

            # Eg. if we don't have USDUSD, then we need to convert
            if trade_request.ticker[0:3] != tca_request.reporting_currency:

                # So if we have EURJPY, we want to download EURUSD data
                reporting_ticker = trade_request.ticker[
                    0:3] + tca_request.reporting_currency

                reporting_spot, trade_df = self._fill_reporting_spot(
                    reporting_ticker, trade_df, start_date, finish_date,
                    tca_request)

                if reporting_spot is not None:
                    trade_df[
                        'notional_reporting_currency_mid'] = reporting_spot.values

                    # trade_df['notional_reporting_currency_mid'] = \
                    #     self._time_series_ops.vlookup_style_data_frame(trade_df.index, market_conversion_df, 'mid')[0].values

                    trade_df[
                        'reporting_currency'] = tca_request.reporting_currency

                    columns_to_report = [
                        'executed_notional', 'notional', 'order_notional'
                    ]

                    for c in columns_to_report:
                        if c in trade_df.columns:
                            trade_df[c + '_in_reporting_currency'] = \
                                trade_df['notional_reporting_currency_mid'].values * trade_df[c]
                else:
                    logger.warning(
                        "Couldn't get spot data to convert notionals into reporting currency. Hence not returning trading data."
                    )

                    return None
            else:
                # ie. USDUSD, so spot is 1
                trade_df['notional_reporting_currency_mid'] = 1.0

                # Reporting currency is the same as the notional of the trade, so no need to convert, just
                # replicate columns
                trade_df['reporting_currency'] = tca_request.reporting_currency

                columns_to_report = [
                    'executed_notional', 'notional', 'order_notional'
                ]

                for c in columns_to_report:
                    if c in trade_df.columns:
                        trade_df[c + '_in_reporting_currency'] = trade_df[c]

        return trade_df
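
# --- Standalone sketch (separate from the class above) ---
# A minimal, hedged illustration of the reporting currency conversion performed
# above: each trade notional is multiplied by an aligned base/reporting spot
# rate ('notional_reporting_currency_mid'). All figures below are invented, and
# the alignment that _fill_reporting_spot performs is assumed to have been done.

import pandas as pd

index = pd.to_datetime(['2020-01-01 10:00:00', '2020-01-01 11:00:00'])

trade_df_example = pd.DataFrame(index=index,
                                data={'executed_notional': [1000000.0, 500000.0]})

# Hypothetical EURUSD mid rates already aligned to the trade timestamps
trade_df_example['notional_reporting_currency_mid'] = [1.1005, 1.1010]
trade_df_example['reporting_currency'] = 'USD'

for c in ['executed_notional', 'notional', 'order_notional']:
    if c in trade_df_example.columns:
        trade_df_example[c + '_in_reporting_currency'] = \
            trade_df_example['notional_reporting_currency_mid'].values * trade_df_example[c]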

    def get_trade_order_holder(self, tca_request):
        logger = LoggerManager.getLogger(__name__)

        # Get all the trade/orders which have been requested, eg. trade_df and order_df
        # do separate calls given they are assumed to be stored in different database tables
        trade_order_holder = DataFrameHolder()

        if tca_request.trade_order_mapping is not None:
            logger.debug("Get trade order holder for " +
                         str(tca_request.ticker) + " from " +
                         str(tca_request.start_date) + " - " +
                         str(tca_request.finish_date))

            for trade_order_type in tca_request.trade_order_mapping:
                trade_order_df = self.get_trade_order_data(
                    tca_request, trade_order_type)

                trade_order_holder.add_dataframe(trade_order_df,
                                                 trade_order_type)

        return trade_order_holder

    def get_market_trade_order_holder(self, tca_request):
        """Gets the both the market data and trade/order data associated with a TCA calculation as a tuple of
        (DataFrame, DataFrameHolder)

        Parameters
        ----------
        tca_request : TCARequest
            Parameters for a TCA calculation

        Returns
        -------
        DataFrame, DataFrameHolder
        """

        logger = LoggerManager.getLogger(__name__)

        logger.debug("Get market and trade/order data for " +
                     str(tca_request.ticker) + " from " +
                     str(tca_request.start_date) + " - " +
                     str(tca_request.finish_date))

        # Get all the trade/orders which have been requested, eg. trade_df and order_df
        # do separate calls given they are assumed to be stored in different database tables
        return self.get_market_data(tca_request), \
               self.get_trade_order_holder(tca_request)

    def calculate_metrics_single_ticker(self, market_trade_order_combo,
                                        tca_request, dummy_market):
        """Calls auxillary methods to get market/trade data for a single ticker. If necessary splits up the request into
        smaller date chunks to collect market and trade data in parallel (using Celery)

        Parameters
        ----------
        tca_request : TCARequest
            Parameter for the TCA analysis

        dummy_market : bool
            Should we put a dummy variable instead of returning market data

        Returns
        -------
        DataFrame, DataFrameHolder, str
        """

        trade_order_filter = tca_request.trade_order_filter
        benchmark_calcs = tca_request.benchmark_calcs
        metric_calcs = tca_request.metric_calcs
        ticker = tca_request.ticker

        logger = LoggerManager.getLogger(__name__)

        # Reassemble market and trade data from the tuple
        market_df, trade_order_df_dict = self.trim_sort_market_trade_order(
            market_trade_order_combo, tca_request.start_date,
            tca_request.finish_date, tca_request.ticker)

        # Calculate BenchmarkMarket's which only require market data and no trade data
        market_df = self.calculate_benchmark_market(market_df, tca_request)

        trade_order_df_values = []
        trade_order_df_keys = []

        # Calculations on trades with market data
        if len(trade_order_df_dict.keys()) > 0 and self._check_valid_market(
                market_df):

            # NOTE: this will not filter orders, only TRADES (as orders do not have venue parameters)
            logger.debug("Filter trades by venue")

            simple_filters = {'venue': tca_request.venue}

            if 'trade_df' in self._util_func.dict_key_list(trade_order_df_dict.keys()):
                for s in simple_filters.keys():
                    trade_order_df_dict['trade_df'] = self._trade_order_tag.filter_trade_order(
                        trade_order_df=trade_order_df_dict['trade_df'],
                        tag_value_combinations={s: simple_filters[s]})

            # Do additional more customised post-filtering of the trade/orders (eg. by broker_id, algo_id)
            if trade_order_filter is not None:
                for a in trade_order_filter:
                    trade_order_df_dict = a.filter_trade_order_dict(
                        trade_order_df_dict=trade_order_df_dict)

            # NOTE: this will not filter orders, only TRADES (as orders do not have event type parameters)
            simple_filters = {'event_type': tca_request.event_type}

            if 'trade_df' in self._util_func.dict_key_list(trade_order_df_dict.keys()):
                for s in simple_filters.keys():
                    trade_order_df_dict['trade_df'] = self._trade_order_tag.filter_trade_order(
                        trade_order_df=trade_order_df_dict['trade_df'],
                        tag_value_combinations={s: simple_filters[s]})

            # Remove any trade/orders which are empty
            t_remove = []

            for t in trade_order_df_dict.keys():
                if trade_order_df_dict[t] is None:
                    t_remove.append(t)

                    logger.warning(
                        t + " is empty... might cause problems later!")
                elif trade_order_df_dict[t].empty:
                    t_remove.append(t)

                    logger.warning(
                        t + " is empty... might cause problems later!")

            for t in t_remove:
                trade_order_df_dict.pop(t)

            trade_order_list = self._util_func.dict_key_list(
                trade_order_df_dict.keys())

            # Check if we have any trades/orders left to analyse?
            if len(trade_order_list) == 0:
                logger.error("No trade/orders for " + ticker)
            else:
                # ok we have some trade/orders left to analyse
                if not (isinstance(trade_order_list, list)):
                    trade_order_list = [trade_order_list]

                logger.debug("Calculating derived fields and benchmarks")

                logger.debug("Calculating execution fields")

                # Calculate derived executed fields for orders
                # can only do this if trade_df is also available
                if len(trade_order_df_dict.keys()) > 1 and \
                        'trade_df' in self._util_func.dict_key_list(trade_order_df_dict.keys()):

                    # For the orders, calculate the derived fields for executed notional, trade etc.
                    aggregated_notional_fields = 'executed_notional'

                    # Calculate the derived fields of the orders from the trades
                    # and also calculate any benchmarks for the orders
                    for i in range(1, len(trade_order_list)):
                        # NOTIONAL_EXECUTED: add derived fields for the executed price and
                        # executed notional of the orders
                        trade_order_df_dict[trade_order_list[i]] = \
                            self._metric_executed_price.calculate_metric(
                                lower_trade_order_df=trade_order_df_dict[trade_order_list[i - 1]],
                                upper_trade_order_df=trade_order_df_dict[trade_order_list[i]],
                                aggregated_ids=constants.order_name + '_pointer_id',
                                aggregated_notional_fields=aggregated_notional_fields,
                                notional_reporting_currency_spot='notional_reporting_currency_mid')[0]

                # TODO not sure about this?
                if 'trade_df' in self._util_func.dict_key_list(trade_order_df_dict.keys()):
                    if 'notional' not in trade_order_df_dict['trade_df'].columns:
                        trade_order_df_dict['trade_df']['notional'] = \
                            trade_order_df_dict['trade_df']['executed_notional']

                logger.debug("Calculating benchmarks")

                # Calculate user specified benchmarks for each trade order (which has been selected)
                if benchmark_calcs is not None:

                    for i in range(0, len(trade_order_df_dict)):
                        for b in benchmark_calcs:
                            # For benchmarks which need to be generated on a trade by trade basis (eg. VWAP, arrival etc)
                            if not (isinstance(b, BenchmarkMarket)):
                                logger.debug("Calculating " +
                                             type(b).__name__ + " for " +
                                             trade_order_list[i])

                                if trade_order_df_dict[trade_order_list[i]] is not None:
                                    if not trade_order_df_dict[trade_order_list[i]].empty:
                                        trade_order_df_dict[trade_order_list[i]], _ = \
                                            b.calculate_benchmark(
                                                trade_order_df=trade_order_df_dict[trade_order_list[i]],
                                                market_df=market_df,
                                                trade_order_name=trade_order_list[i])

                logger.debug("Calculating metrics")

                # Calculate user specified metrics for each trade order (which has been selected)
                if metric_calcs is not None:
                    for i in range(0, len(trade_order_df_dict)):
                        for m in metric_calcs:
                            logger.debug("Calculating " + type(m).__name__ +
                                         " for " + trade_order_list[i])

                            if trade_order_df_dict[trade_order_list[i]] is not None:
                                if not trade_order_df_dict[trade_order_list[i]].empty:
                                    trade_order_df_dict[trade_order_list[i]], _ = \
                                        m.calculate_metric(
                                            trade_order_df=trade_order_df_dict[trade_order_list[i]],
                                            market_df=market_df,
                                            trade_order_name=trade_order_list[i])

                logger.debug("Completed derived field calculations for " +
                             ticker)

            trade_order_df_dict = self._calculate_additional_metrics(
                market_df, trade_order_df_dict, tca_request)

            if dummy_market:
                market_df = None

            trade_order_df_keys = self._util_func.dict_key_list(
                trade_order_df_dict.keys())
            trade_order_df_values = []

            for k in trade_order_df_keys:
                trade_order_df_values.append(trade_order_df_dict[k])

        # print("--- dataframes/keys ---")
        # print(trade_order_df_values)
        # print(trade_order_df_keys)

        return market_df, trade_order_df_values, ticker, trade_order_df_keys
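
# --- Standalone sketch (separate from the class above) ---
# filter_trade_order is not reproduced here; conceptually, the simple venue and
# event_type filters applied above reduce to boolean selection on a tag column.
# The column values below are invented purely for illustration.

import pandas as pd

trade_df_example = pd.DataFrame({'venue': ['venue1', 'venue2', 'venue1'],
                                 'executed_price': [1.1005, 1.1007, 1.1010]})

tag_value_combinations = {'venue': 'venue1'}

# Keep only the trades whose tag matches the requested value
for tag, value in tag_value_combinations.items():
    if tag in trade_df_example.columns:
        trade_df_example = trade_df_example[trade_df_example[tag] == value]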

    def calculate_benchmark_market(self, market_df, tca_request):

        logger = LoggerManager.getLogger(__name__)

        benchmark_calcs = tca_request.benchmark_calcs
        valid_market = self._check_valid_market(market_df)

        # Calculations on market data only
        if valid_market:
            for b in benchmark_calcs:

                # For benchmarks which only modify market data (and don't need trade specific information)
                if isinstance(b, BenchmarkMarket):
                    logger.debug("Calculating " + type(b).__name__ +
                                 " for market data")

                    market_df = b.calculate_benchmark(market_df=market_df)

        return market_df

    def _check_valid_market(self, market_df):
        if market_df is not None:
            if not (market_df.empty):
                return True

        return False

    def _fill_reporting_spot(self, ticker, trade_df, start_date, finish_date,
                             tca_request):
        logger = LoggerManager.getLogger(__name__)

        market_request = MarketRequest(
            start_date=start_date,
            finish_date=finish_date,
            ticker=ticker,
            data_store=tca_request.market_data_store,
            data_offset_ms=tca_request.market_data_offset_ms,
            use_multithreading=tca_request.use_multithreading,
            market_data_database_table=tca_request.market_data_database_table,
            multithreading_params=tca_request.multithreading_params)

        market_conversion_df = self.get_market_data(market_request)

        # Make sure the trades/orders are within the market data. For the purposes of the reporting spot,
        # we don't need to consider the length of the order, JUST the starting point
        trade_df = self.strip_trade_order_data_to_market(
            trade_df, market_conversion_df, consider_order_length=False)

        reporting_spot = None

        # Need to check whether we actually have any trade data/market data
        if trade_df is not None and market_conversion_df is not None:
            if not (trade_df.empty) and not (market_conversion_df.empty):

                try:
                    reporting_spot = \
                        self._time_series_ops.vlookup_style_data_frame(trade_df.index, market_conversion_df, 'mid')[
                            0]

                except Exception:
                    logger.error(
                        "Reporting spot is missing for this trade data sample!"
                    )

                if reporting_spot is None:
                    market_start_finish = "No market data in this sample. "

                    if market_conversion_df is not None:
                        market_start_finish = "Market data is between " + str(
                            market_conversion_df.index[0]) + " - " + str(
                                market_conversion_df.index[-1]) + ". "

                    logger.warning(market_start_finish)
                    logger.warning("Trade data is between " +
                                   str(trade_df.index[0]) + " - " +
                                   str(trade_df.index[-1]) + ".")

                    logger.warning(
                        "Couldn't get spot data to convert notionals into the reporting currency. Hence not returning trading data."
                    )

        return reporting_spot, trade_df
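
# --- Standalone sketch (separate from the class above) ---
# vlookup_style_data_frame is not reproduced here; a rough pandas approximation
# of picking, for each trade timestamp, the most recent market 'mid' is an
# as-of join (both frames must be sorted by index). All figures are invented.

import pandas as pd

market_df_example = pd.DataFrame(
    index=pd.to_datetime(['2020-01-01 09:59:00', '2020-01-01 10:30:00']),
    data={'mid': [1.1000, 1.1010]})

trade_df_example = pd.DataFrame(
    index=pd.to_datetime(['2020-01-01 10:00:00', '2020-01-01 11:00:00']),
    data={'executed_price': [1.1005, 1.1012]})

# For each trade, take the last market mid at or before the trade timestamp
reporting_spot_example = pd.merge_asof(trade_df_example, market_df_example,
                                       left_index=True, right_index=True)['mid']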

    def _invert_quoting_market(self, market_df):
        """Inverts the quote data for an FX pair (eg. converts USD/GBP to GBP/USD) by calculating the reciprical. Also
        swaps around the bid/ask fields for consistency.

        Parameters
        ----------
        market_df : DataFrame
            Contains market data, typically quote data

        Returns
        -------
        DataFrame
        """

        if isinstance(market_df, pd.Series):
            market_df = pd.DataFrame(market_df)

        if 'mid' in market_df.columns:
            market_df['mid'] = 1.0 / market_df['mid'].values

        # Need to swap around bid/ask when inverting market data!
        if 'bid' in market_df.columns and 'ask' in market_df.columns:

            # Take a copy of the original bid first, otherwise the second assignment
            # below would read the already-overwritten bid column
            bid = market_df['bid'].values.copy()

            market_df['bid'] = 1.0 / market_df['ask'].values
            market_df['ask'] = 1.0 / bid

        return market_df
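
# --- Standalone sketch (separate from the class above) ---
# Illustrates why the bid/ask swap above must read from the original columns:
# when inverting a quote, the old ask becomes the new bid (and vice versa),
# because the best price to buy the inverted pair is derived from the best
# price to sell the original pair. Quotes below are invented.

import pandas as pd

quote_df = pd.DataFrame(index=pd.to_datetime(['2020-01-01 10:00:00']),
                        data={'bid': [0.7990], 'mid': [0.8000], 'ask': [0.8010]})

inverted_df = quote_df.copy()
inverted_df['mid'] = 1.0 / quote_df['mid'].values
inverted_df['bid'] = 1.0 / quote_df['ask'].values  # old ask -> new bid
inverted_df['ask'] = 1.0 / quote_df['bid'].values  # old bid -> new ask

# The inverted bid (~1.2484) stays below the inverted ask (~1.2516)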

    def _get_correct_convention_market_data(self,
                                            market_request,
                                            start_date=None,
                                            finish_date=None):
        """Gets market data for a ticker, when it is in the correct market convention. Otherwise throws an exception.

        Parameters
        ----------
        market_request : MarketRequest
            Parameters for the market data.

        Returns
        -------
        DataFrame
        """

        # Check that cross is in correct convention
        if self._fx_conv.correct_notation(
                market_request.ticker) != market_request.ticker:
            raise Exception(
                'Method expecting only crosses in correct market convention')

        if start_date is None and finish_date is None:
            start_date = market_request.start_date
            finish_date = market_request.finish_date

        return self._get_underlying_market_data(start_date, finish_date,
                                                market_request)

    def _get_underlying_market_data(self, start_date, finish_date,
                                    market_request):
        # Create request for market data
        market_request = MarketRequest(
            start_date=start_date,
            finish_date=finish_date,
            ticker=market_request.ticker,
            data_store=market_request.data_store,
            data_offset_ms=market_request.data_offset_ms,
            market_data_database_table=market_request.market_data_database_table)

        # Fetch market data in that ticker (will be tick data)
        market_df = self._data_factory.fetch_table(data_request=market_request)

        # TODO do further filtering of market and trade data as necessary
        if constants.resample_ms is not None:
            market_df = self._time_series_ops.resample_time_series(
                market_df, resample_ms=constants.resample_ms)

            market_df.dropna(inplace=True)

        ## TODO drop stale quotes for market data and add last update time?

        # Calculate mid market rate, if it doesn't exist
        if market_df is not None:
            if not (market_df.empty):
                market_df = self._benchmark_mid.calculate_benchmark(
                    market_df=market_df)

        return market_df
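
# --- Standalone sketch (separate from the class above) ---
# resample_time_series is not reproduced here; a plain pandas approximation of
# downsampling tick data to an assumed 100ms grid, dropping empty bins as the
# method above does. The random walk below is invented for illustration.

import numpy as np
import pandas as pd

tick_index = pd.date_range('2020-01-01 10:00:00', periods=1000, freq='ms')
tick_df = pd.DataFrame(index=tick_index,
                       data={'mid': 1.10 + np.cumsum(np.random.normal(0.0, 1e-5, 1000))})

# Take the last tick in each 100ms bucket, then drop buckets with no ticks
resampled_df = tick_df.resample('100ms').last()
resampled_df.dropna(inplace=True)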

    def trim_sort_market_trade_order(self, market_trade_order_tuple,
                                     start_date, finish_date, ticker):
        """Takes market and trade/order data, then trims it so that the trade/order data is entirely within the
        start/finish date range of market data. If trade/order data does not fully overlap with the market data
        it can cause problems later when computing metrics/benchmarks.

        Parameters
        ----------
        market_trade_order_tuple : tuple
            Tuple of market data with trade/order data

        start_date : datetime
            Start date of TCA analysis

        finish_date : datetime
            Finish date of TCA analysis

        ticker : str
            Ticker

        Returns
        -------
        DataFrame, DataFrame (dict)
        """
        logger = LoggerManager.getLogger(__name__)

        market_df, trade_order_holder = self._convert_tuple_to_market_trade(
            market_trade_order_tuple)
        logger.debug("Filter the market date by start/finish date")

        # Trim the market data to the start/finish dates
        market_df = self._time_series_ops.filter_start_finish_dataframe(
            market_df, start_date, finish_date)

        # When reassembling the market data, give the user the option of sorting it, in case it was loaded out of order
        if market_df is not None and constants.re_sort_market_data_when_assembling:
            if not (market_df.empty):
                logger.debug("Filtered by start/finish date now sorting")

                market_df = market_df.sort_index()

        # Check if there's any market data; if we have none at all, then we can't do any TCA, so warn the user...
        if market_df is None or len(market_df.index) == 0:
            err_msg = "No market data between selected dates for " + ticker + " between " + str(start_date) + " - " \
                      + str(finish_date)

            logger.warning(err_msg)

            # raise DataMissingException(err_msg)

        logger.debug("Combine trade/order data")

        # Combine all the trades in a single dataframe (and also the same for orders)
        # which are placed into a single dict
        trade_order_df_dict = trade_order_holder.get_combined_dataframe_dict()

        # Make sure the trade data is totally within the market data (if trade data is outside market data, then
        # can't calculate any metrics later)
        for k in self._util_func.dict_key_list(trade_order_df_dict.keys()):
            trade_order_df_dict[k] = self.strip_trade_order_data_to_market(
                trade_order_df_dict[k], market_df)

        # Note, we can sometimes get empty results when running in parallel (eg. when splitting up into days,
        # a particular day may have no trades), so don't raise an exception
        if not (trade_order_holder.check_empty_combined_dataframe_dict(
                trade_order_df_dict)):
            err_msg = "No trade/order data between selected dates for " + ticker + " between " + str(start_date) + " - " \
                      + str(finish_date)

            logger.warning(err_msg)

            # raise DataMissingException(err_msg)

        return market_df, trade_order_df_dict

    def strip_trade_order_data_to_market(self,
                                         trade_order_df,
                                         market_df,
                                         consider_order_length=True):
        """Strips down the trade/order data so that it is within the market data provided. Hence, trade/order data
        will fully overlap with the market data.

        Parameters
        ----------
        trade_order_df : DataFrame
            Trade/order data from the client

        market_df : DataFrame
            Market data

        consider_order_length : bool (default: True)
            Should we consider the length of the order, when we consider the overlap?

        Returns
        -------
        DataFrame
        """

        if market_df is not None and trade_order_df is not None:
            if not (market_df.empty) and not (trade_order_df.empty):

                add_cond = True

                # For orders (ensure that the start/end time of every order is within the market data start/finish dates)
                # this is important, given that we often want to calculate benchmarks over orders from market data
                if consider_order_length:

                    if 'benchmark_date_start' in trade_order_df.columns \
                            and 'benchmark_date_end' in trade_order_df.columns:

                        add_cond = (trade_order_df['benchmark_date_start'] >= market_df.index[0]) \
                            & (trade_order_df['benchmark_date_end'] <= market_df.index[-1])

                # For trades (ensure that every trade is within the market data start/finish dates)
                trade_order_df = trade_order_df.loc[
                    (trade_order_df.index >= market_df.index[0])
                    & (trade_order_df.index <= market_df.index[-1]) & add_cond]

        return trade_order_df
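
# --- Standalone sketch (separate from the class above) ---
# The trimming above is a boolean mask against the first/last market timestamps;
# trades outside the market data window are dropped so every remaining trade can
# be benchmarked. Timestamps and prices below are invented.

import pandas as pd

market_df_example = pd.DataFrame(
    index=pd.date_range('2020-01-01 10:00:00', '2020-01-01 12:00:00', freq='1min'),
    data={'mid': 1.10})

trade_df_example = pd.DataFrame(
    index=pd.to_datetime(['2020-01-01 09:30:00', '2020-01-01 10:30:00',
                          '2020-01-01 12:30:00']),
    data={'executed_price': [1.0995, 1.1005, 1.1010]})

# Only the 10:30 trade survives; the 09:30 and 12:30 trades fall outside
trade_df_example = trade_df_example.loc[
    (trade_df_example.index >= market_df_example.index[0])
    & (trade_df_example.index <= market_df_example.index[-1])]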

    def _strip_start_finish_dataframe(self, data_frame, start_date,
                                      finish_date, tca_request):
        """Strips down the data frame to the dates which have been requested in the initial TCA request

        Parameters
        ----------
        data_frame : DataFrame
            Data to be stripped down

        start_date : datetime
            Start date of the computation

        finish_date : datetime
            Finish date of the computation

        tca_request : TCARequest
            Parameters for the TCA request

        Returns
        -------
        DataFrame
        """

        # print(data_frame)

        if start_date != tca_request.start_date:
            if data_frame is not None:
                if not (data_frame.empty):
                    data_frame = data_frame.loc[
                        data_frame.index >= tca_request.start_date]

        if finish_date != tca_request.finish_date:
            if data_frame is not None:
                if not (data_frame.empty):
                    data_frame = data_frame.loc[
                        data_frame.index <= tca_request.finish_date]

        return data_frame

    def _check_is_empty_trade_order(self, trade_df, tca_request, start_date,
                                    finish_date, trade_order_type):

        logger = LoggerManager.getLogger(__name__)

        if trade_df is None or trade_df.empty:
            logger.warning("Missing trade data for " + tca_request.ticker +
                           " between " + str(start_date) + " - " +
                           str(finish_date) + " in " + trade_order_type)

            return True

        return False

    @abc.abstractmethod
    def _calculate_additional_metrics(self, market_df, trade_order_df_dict,
                                      tca_request):
        pass

    @abc.abstractmethod
    def _convert_tuple_to_market_trade(self, market_trade_order_tuple):
        pass

    @abc.abstractmethod
    def get_tca_version(self):
        pass