def _join_market_downsampled_trade_orders(self, market_downsampled_df, trade_order_df, fields=None):
    """Combines market data with trade/orders into a sparse DataFrame. Typically used when preparing to display
    a mixture of market and trade data together.

    Parameters
    ----------
    market_downsampled_df : DataFrame
        Market data which has been downsampled

    trade_order_df : DataFrame
        Trade/order data to be combined

    fields : str (list)
        Fields to keep

    Returns
    -------
    DataFrame
    """
    logger = LoggerManager.getLogger(__name__)

    if fields is not None:
        trade_order_df = self._time_series_ops.filter_time_series_by_matching_columns(trade_order_df, fields)

    logger.debug('About to join')

    sparse_market_trade_df = market_downsampled_df.join(trade_order_df, how='outer')

    # Add buy/sell trade prices in new columns (easier for plotting later), provided the necessary
    # columns actually exist after the join
    if 'executed_price' in sparse_market_trade_df.columns and 'side' in sparse_market_trade_df.columns:
        executed_price = sparse_market_trade_df['executed_price'].values
        side_to_match = sparse_market_trade_df['side'].values

        sparse_market_trade_df['buy_trade'] \
            = self._time_series_ops.nanify_array_based_on_other(side_to_match, -1, executed_price)  # make sells NaN (NOT buys!)
        sparse_market_trade_df['sell_trade'] \
            = self._time_series_ops.nanify_array_based_on_other(side_to_match, 1, executed_price)  # make buys NaN (NOT sells!)

    logger.debug('Finished joining')

    return sparse_market_trade_df
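# A minimal standalone sketch (assumed semantics) of how nanify_array_based_on_other is used above:
# the executed price is kept only where the side does NOT match the given value, so the 'buy_trade'
# column NaNs out the sells (side == -1) and 'sell_trade' NaNs out the buys (side == 1).
import numpy as np

def nanify_array_based_on_other_sketch(match_array, match_value, values):
    values = values.astype(float).copy()
    values[match_array == match_value] = np.nan

    return values

side = np.array([1, -1, 1, -1])
executed_price = np.array([1.10, 1.11, 1.12, 1.13])

buy_trade = nanify_array_based_on_other_sketch(side, -1, executed_price)   # [1.10, nan, 1.12, nan]
sell_trade = nanify_array_based_on_other_sketch(side, 1, executed_price)   # [nan, 1.11, nan, 1.13]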
def _fetch_market_data(self, start, finish, ticker, write_to_disk=True, read_cached_from_disk=True,
                       web_proxies=constants.web_proxies):
    logger = LoggerManager.getLogger(__name__)

    key = (str(start) + str(finish) + ticker + '_' + self._get_postfix()).replace(":", '_')
    filename = os.path.join(self.temp_data_folder, key) + '.' + fileformat

    util_func = UtilFunc()

    start_time_stamp = pd.Timestamp(start)
    finish_time_stamp = pd.Timestamp(finish)

    if self._remove_weekend_points():
        weekend_data = "Weekend? " + key

        weekday_point = UtilFunc().is_weekday_point(start_time_stamp, finish_time_stamp,
                                                    friday_close_nyc_hour=constants.friday_close_utc_hour,
                                                    sunday_open_utc_hour=constants.sunday_open_utc_hour)

        if not weekday_point:
            return None, weekend_data

    df = None

    if read_cached_from_disk:
        if os.path.exists(filename):
            df = util_func.read_dataframe_from_binary(filename, format=binary_format)

            if df is not None:
                logger.debug("Read " + filename + " from disk")

    if df is None:
        # Convert the tcapy ticker into the vendor's ticker
        df = self._get_input_data_source().fetch_market_data(start, finish,
                                                             ticker=self._get_tickers_vendor()[ticker],
                                                             web_proxies=web_proxies)

        if df is not None:
            if write_to_disk:
                # Write a small temporary dataframe to disk (if the process fails later, these can be picked up,
                # without having to call the external vendor again)
                util_func.write_dataframe_to_binary(df, filename, format=binary_format)

    msg = None

    if df is None:
        msg = "No data? " + key

    return df, msg
def combine_resampled_spot_data_into_single_dataframe_usd_base(resample_freq='1min', data_vendor='dukascopy'):
    df_list = []

    logger = LoggerManager.getLogger(__name__)

    for ticker in ticker_combined_mkt:
        logger.info("Reading " + ticker + " resample freq " + resample_freq + " data vendor " + data_vendor)

        df = pd.read_parquet(csv_output + ticker + '_' + resample_freq + '_' + data_vendor + '.' + file_extension)

        base = ticker[0:3]
        terms = ticker[3:6]

        # If USD is the terms currency, invert the quote so that USD becomes the base
        if terms == 'USD':
            df_invert = pd.DataFrame(index=df.index)

            df_invert[terms + base + '.open'] = 1.0 / df[ticker + '.open']

            # Inverting swaps high and low!
            df_invert[terms + base + '.high'] = 1.0 / df[ticker + '.low']
            df_invert[terms + base + '.low'] = 1.0 / df[ticker + '.high']

            df_invert[terms + base + '.close'] = 1.0 / df[ticker + '.close']
            df_invert[terms + base + '.tickcount'] = df[ticker + '.tickcount']

            df = df_invert

        df_list.append(df)

    logger.info("Combining all tickers with resample freq " + resample_freq + " data vendor " + data_vendor)

    # Add a dummy USDUSD cross, which is always 1
    df = pd.DataFrame(index=df.index)
    df['USDUSD.close'] = 1.0

    df_list.append(df)

    df = calculations.pandas_outer_join(df_list)
    df = df.dropna()

    combined_file = 'fx_' + resample_freq + '_' + data_vendor + '.' + file_extension

    df.to_parquet(csv_output + combined_file)
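# Quick illustration of why high and low must be swapped when inverting a quote: if EURUSD trades
# in the range [low, high], then USDEUR = 1/EURUSD trades in [1/high, 1/low]. Column names follow
# the '<ticker>.<field>' convention used above.
import pandas as pd

eurusd = pd.DataFrame({'EURUSD.open': [1.10], 'EURUSD.high': [1.12],
                       'EURUSD.low': [1.08], 'EURUSD.close': [1.11]})

usdeur = pd.DataFrame(index=eurusd.index)
usdeur['USDEUR.open'] = 1.0 / eurusd['EURUSD.open']
usdeur['USDEUR.high'] = 1.0 / eurusd['EURUSD.low']    # inverted low becomes the high
usdeur['USDEUR.low'] = 1.0 / eurusd['EURUSD.high']    # inverted high becomes the low
usdeur['USDEUR.close'] = 1.0 / eurusd['EURUSD.close']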
def get(self, key, burn_after_reading=False):
    """Gets the object(s) associated with the key(s) or CacheHandle(s)

    Parameters
    ----------
    key : str or CacheHandle (list)
        Key(s) to be fetched

    burn_after_reading : bool (default: False)
        Should the key be erased after reading?

    Returns
    -------
    object
    """
    logger = LoggerManager.getLogger(__name__)

    key = copy.copy(key)

    single = False

    if not isinstance(key, list):
        key = [key]
        single = True

    # Unwrap any CacheHandles into their underlying key names
    for i in range(0, len(key)):
        if isinstance(key[i], CacheHandle):
            key[i] = key[i].handle_name

    obj = None

    try:
        obj = self._get(key, burn_after_reading=burn_after_reading)
    except Exception as e:
        logger.warning("Couldn't retrieve " + str(key) + " from cache: " + str(e))

    if single and obj is not None:
        return obj[0]

    return obj
def get_trade_order_holder(self, tca_request): logger = LoggerManager.getLogger(__name__) logger.debug( "Get trade order holder for " + str(tca_request.ticker) + " from " + str(tca_request.start_date) + " - " + str(tca_request.finish_date)) # Get all the trade/orders which have been requested, eg. trade_df and order_df # do separate calls given they are assumed to be stored in different database tables trade_order_holder = DataFrameHolder() if tca_request.trade_order_mapping is not None: for trade_order_type in tca_request.trade_order_mapping: trade_order_df = self.get_trade_order_data(tca_request, trade_order_type) trade_order_holder.add_dataframe(trade_order_df, trade_order_type) return trade_order_holder
def _fill_reporting_spot(self, ticker, trade_df, start_date, finish_date, tca_request):
    logger = LoggerManager.getLogger(__name__)

    market_request = MarketRequest(start_date=start_date, finish_date=finish_date, ticker=ticker,
                                   data_store=tca_request.market_data_store,
                                   data_offset_ms=tca_request.market_data_offset_ms,
                                   use_multithreading=tca_request.use_multithreading,
                                   multithreading_params=tca_request.multithreading_params)

    market_conversion_df = self.get_market_data(market_request)

    # Make sure the trades/orders are within the market data (for the purposes of the reporting spot
    # we don't need to consider the length of the order, just its starting point)
    trade_df = self.strip_trade_order_data_to_market(trade_df, market_conversion_df,
                                                     consider_order_length=False)

    reporting_spot = None

    # Need to check whether we actually have any trade data/market data
    if trade_df is not None and market_conversion_df is not None:
        if not trade_df.empty and not market_conversion_df.empty:
            try:
                reporting_spot = \
                    self._time_series_ops.vlookup_style_data_frame(trade_df.index, market_conversion_df, 'mid')[0]
            except Exception:
                logger.error("Reporting spot is missing for this trade data sample!")

    if reporting_spot is None:
        market_start_finish = "No market data in this sample. "

        if market_conversion_df is not None:
            market_start_finish = "Market data is between " + str(market_conversion_df.index[0]) + " - " \
                                  + str(market_conversion_df.index[-1]) + ". "

        logger.warning(market_start_finish)

        if trade_df is not None:
            logger.warning("Trade data is between " + str(trade_df.index[0]) + " - "
                           + str(trade_df.index[-1]) + ".")

        logger.warning("Couldn't get spot data to convert notionals currency. Hence not returning trading data.")

    return reporting_spot, trade_df
def calculate_benchmark_market(self, market_df, tca_request): logger = LoggerManager.getLogger(__name__) benchmark_calcs = tca_request.benchmark_calcs valid_market = self._check_valid_market(market_df) # Calculations on market data only if valid_market: for b in benchmark_calcs: # For benchmarks which only modify market data (and don't need trade specific information) if isinstance(b, BenchmarkMarket): logger.debug("Calculating " + type(b).__name__ + " for market data") market_df = b.calculate_benchmark(market_df=market_df) return market_df
def _download(self, md_request, folder_prefix): from findatapy.market import MarketDataRequest, MarketDataGenerator, Market logger = LoggerManager.getLogger(__name__) market = Market(market_data_generator=MarketDataGenerator()) ticker = md_request.ticker[0] df = market.fetch_market(md_request=md_request) df.columns = ['bid', 'ask', 'bidv', 'askv'] df['venue'] = 'dukascopy' df['ticker'] = ticker df['mid'] = (df['bid'].values + df['ask'].values) / 2.0 self.dump_hdf5_file(df, folder_prefix + "_" + ticker + ".h5") logger.info('Dumped to ' + folder_prefix + "_" + ticker + ".h5")
def _chunk_dataframes(self, obj):
    logger = LoggerManager.getLogger(__name__)

    # Can sometimes have very large dataframes, which need to be split,
    # otherwise they won't fit in a single Redis key
    mem = obj.memory_usage(deep=True).sum()
    mem_float = round(float(mem) / (1024.0 * 1024.0), 3)
    mem = '----------- ' + str(mem_float) + ' MB -----------'

    chunks = int(math.ceil(mem_float / constants.volatile_cache_max_cache_chunk_size_mb))

    if chunks > 1:
        obj_list = self._time_series_ops.split_array_chunks(obj, chunks=chunks)
    else:
        obj_list = [obj]

    if obj_list != []:
        logger.debug("Pandas dataframe of size: " + mem + " in " + str(chunks) + " chunk(s)")

    return obj_list
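# Worked example of the chunking arithmetic above: the DataFrame's deep memory usage in MB is
# divided by the per-key cache limit and rounded up, eg. a 230 MB frame with a 100 MB limit needs
# 3 chunks. The limit below is a hypothetical placeholder, not the actual constants value.
import math

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(1000, 5))

volatile_cache_max_cache_chunk_size_mb = 100  # hypothetical limit

mem_float = round(float(df.memory_usage(deep=True).sum()) / (1024.0 * 1024.0), 3)
chunks = max(int(math.ceil(mem_float / volatile_cache_max_cache_chunk_size_mb)), 1)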
def _check_is_empty_trade_order(self, trade_df, tca_request, start_date, finish_date, trade_order_type):
    logger = LoggerManager.getLogger(__name__)

    if trade_df is None or trade_df.empty:
        logger.warning("Missing trade data for " + tca_request.ticker + " between " + str(start_date)
                       + " - " + str(finish_date) + " in " + trade_order_type)

        return True

    return False
def load_market_calculate_summarize_metrics(self, tca_request, dummy_market=False):
    """Splits up the TCA request into individual tickers. Market/trade data is loaded for each ticker, before
    conducting TCA (ie. calculating metrics, benchmarks etc.). Returns a dictionary consisting of market data and
    another dictionary of trade/order data (and any additional results associated with the TCA)

    Parameters
    ----------
    tca_request : TCARequest
        Parameters defining the TCA calculation

    dummy_market : bool, default False
        Do we return market data for future use?

    Returns
    -------
    DataFrame (dict), DataFrame (dict)
    """

    # Load market/trade data and compute metrics/benchmarks etc. per ticker
    market_df_dict, trade_order_results_df_dict, tca_request_list = \
        self.get_market_trade_metrics(tca_request, dummy_market=dummy_market)

    # If none of the tickers we have selected has trades (and our analysis also requires trades),
    # we can't do any TCA at all
    if len(trade_order_results_df_dict) == 0 and tca_request.trade_data_store is not None \
            and tca_request.trade_order_mapping is None:
        logger = LoggerManager.getLogger(__name__)

        err_msg = "no trade data for specified ticker(s) and time range"

        logger.error(err_msg)

        raise DataMissingException(err_msg)

    # Now summarize those metrics across all the tickers, for easier display
    return self.summarize_metrics(market_df_dict, trade_order_results_df_dict, tca_request_list,
                                  dummy_market=dummy_market)
def _write_df_to_db_single_thread(self, ticker, remove_duplicates=True, if_exists_table='append',
                                  if_exists_ticker='replace'):
    logger = LoggerManager.getLogger(__name__)

    postfix = '-' + self._get_postfix() + '-with-duplicates'

    if remove_duplicates:
        postfix = '-' + self._get_postfix() + '-no-duplicates'

    filename = os.path.join(self.temp_large_data_folder, ticker + postfix) + '.' + fileformat

    logger.info("Reading " + filename)

    util_func = UtilFunc()
    time_series_ops = TimeSeriesOps()
    data_source_local = self._get_output_data_source()

    df = util_func.read_dataframe_from_binary(filename, format=binary_format)

    if df is not None:
        df = time_series_ops.localize_as_UTC(df)

        data_source_local.append_market_data(df, ticker, if_exists_table=if_exists_table,
                                             if_exists_ticker=if_exists_ticker)
    else:
        logger.warning("Couldn't write dataframe for " + ticker + " to database, it appears to be empty!")
def get_market_trade_order_holder(self, tca_request):
    """Gets both the market data and the trade/order data associated with a TCA calculation as a tuple of
    (DataFrame, DataFrameHolder)

    Parameters
    ----------
    tca_request : TCARequest
        Parameters for a TCA calculation

    Returns
    -------
    DataFrame, DataFrameHolder
    """
    logger = LoggerManager.getLogger(__name__)

    logger.debug("Get market and trade/order data for " + str(tca_request.ticker) + " from "
                 + str(tca_request.start_date) + " - " + str(tca_request.finish_date))

    # Get all the trade/orders which have been requested, eg. trade_df and order_df
    # (do separate calls given they are assumed to be stored in different database tables)
    return self.get_market_data(tca_request), self.get_trade_order_holder(tca_request)
def aggregate_tables(self, df_dict={}, tables_dict={}, round_figures_by=None, scalar=None):
    logger = LoggerManager.getLogger(__name__)

    if tables_dict == {}:
        tables_dict = self._tables_dict

    if round_figures_by is None:
        round_figures_by = self._round_figures_by

    if scalar is None:
        scalar = self._scalar

    joined_results = []

    table_name = tables_dict['table_name']
    table_list = tables_dict['table_list']

    column_list = None
    replace_text = None

    if 'column_list' in tables_dict.keys():
        column_list = tables_dict['column_list']

    if 'replace_text' in tables_dict.keys():
        replace_text = tables_dict['replace_text']

    agg_results = []

    for i in range(0, len(table_list)):
        table = table_list[i]

        # If the table is in the output
        if table in df_dict.keys():
            df = df_dict[table].copy()

            if column_list is not None and column_list != []:
                df.columns = [x + ' ' + column_list[i] for x in df.columns]

            df = self._util_func.replace_text_in_cols(df, replace_text)

            # Round/multiply elements in the table if requested
            if df is not None:
                df = self._time_series_ops.multiply_scalar_dataframe(df, scalar=scalar)
                df = self._time_series_ops.round_dataframe(df, round_figures_by)

            agg_results.append(df)
        else:
            logger.warning(table + ' not in calculation output, are you sure the dictionary entry is correct?')

    # If we've collected the tables, try doing a join on all of them
    # to combine them into one large table
    if agg_results != []:
        if len(agg_results) > 1:
            df_joined = self._time_series_ops.outer_join(agg_results)
        else:
            df_joined = agg_results[0]

        joined_results.append((df_joined, table_name))

    return joined_results
def callback(*args):
    """Calculates the aggregated TCA computation when the "Calculate" button is clicked. Caches the results
    and then updates the status label when done.

    Parameters
    ----------
    ticker_val : str(list)
        tickers (eg. EURUSD, GBPUSD etc)

    venue_val : str(list)
        Trading venues

    start_date_val : str(list)
        Start date of TCA calculations

    finish_date_val : str(list)
        Finish date of TCA calculations

    reload_val : str
        Whether underlying market and trade data should be reloaded from dataframe or fetched from cache

    n_clicks : int
        Number of times the button has been clicked

    Returns
    -------
    str
    """
    start = time.time()

    tag = tca_type + '-calculation-button'

    logger = LoggerManager.getLogger(__name__)

    logger.debug('Triggered click ' + tca_type)

    # Make sure none of the other charts are plotted till we have completed this!
    if tca_type == 'aggregated':
        uploadbox = args

        if uploadbox is not None:
            if isinstance(uploadbox, tuple):
                uploadbox = uploadbox[0]

            # Assume that the user uploaded a binary CSV file
            trade_df = DatabaseSourceCSVBinary(trade_data_database_csv=uploadbox).fetch_trade_order_data()

            data_frame_trade_order_mapping = OrderedDict([('trade_df', trade_df)])

            start_date = trade_df.index[0]
            finish_date = trade_df.index[-1]

            ticker_val = FXConv().correct_unique_notation_list(trade_df['ticker'].unique().tolist())

            metric_val = 'slippage'

            self._session_manager.set_session_flag('metric', value=metric_val)
            self._session_manager.set_session_flag('aggregated-visualization', True)

            try:
                # Clear the cache for the current user
                self._glob_volatile_cache.clear_key_match(self._session_manager.get_session_id())

                results_form = [
                    # Show the distribution of the selected metric for trades weighted by notional,
                    # aggregated by ticker and then by venue
                    DistResultsForm(trade_order_list=['trade_df'], metric_name=metric_val,
                                    aggregate_by_field=['ticker', 'broker_id', 'venue'],
                                    weighting_field='executed_notional_in_reporting_currency'),

                    # Display the timeline of metrics averaged by day (and weighted by notional)
                    TimelineResultsForm(trade_order_list=['trade_df'], by_date='date', metric_name=metric_val,
                                        aggregation_metric='mean', aggregate_by_field=['ticker'], scalar=10000.0,
                                        weighting_field='executed_notional_in_reporting_currency'),

                    # Display a bar chart showing the average metric weighted by notional and aggregated
                    # by ticker/venue
                    BarResultsForm(trade_order_list=['trade_df'], metric_name=metric_val,
                                   aggregation_metric='mean',
                                   aggregate_by_field=['ticker', 'venue', 'broker_id'], scalar=10000.0,
                                   weighting_field='executed_notional_in_reporting_currency'),

                    # Create a table with the markout of every trade
                    TableResultsForm(trade_order_list=['trade_df'], metric_name='markout', filter_by='all',
                                     replace_text={'markout_': '', 'executed_notional': 'exec not',
                                                   'notional_currency': 'exec not cur'},
                                     keep_fields=['executed_notional', 'side', 'notional_currency'],
                                     scalar={'all': 10000.0, 'exclude': ['executed_notional', 'side']},
                                     round_figures_by={'all': 2, 'executed_notional': 0, 'side': 0},
                                     weighting_field='executed_notional')
                ]

                try:
                    timeline_trade_df_metric_by_ticker = self.get_cached_computation_analysis(
                        key='timeline_trade_df_' + metric_val + '_by_ticker',
                        tca_engine=self._tca_engine,
                        force_calculate=True,
                        tca_request=TCARequest(
                            start_date=start_date, finish_date=finish_date, ticker=ticker_val,
                            tca_type='aggregated',
                            market_data_store='arctic-ncfx', trade_data_store='dataframe',
                            trade_order_mapping=data_frame_trade_order_mapping,
                            metric_calcs=[
                                MetricSlippage(),
                                MetricMarkout(trade_order_list=['trade_df'])
                            ],
                            results_form=results_form,
                            dummy_market=True,
                            use_multithreading=True))

                    calc_start = timeline_trade_df_metric_by_ticker.index[0]
                    calc_end = timeline_trade_df_metric_by_ticker.index[-1]

                    aggregated_title = self.create_status_msg_flags('aggregated', ticker_val, calc_start,
                                                                    calc_end)

                    logger.debug('Plotted aggregated summary plot!')

                    finish = time.time()

                except Exception as e:
                    logger.exception(e)

                    return "Status: error - " + str(e) + ". Check data exists for these dates?" \
                           + self.get_username_string()

            except Exception as e:
                logger.exception(e)

                return 'Status: error - ' + str(e) + ". Check data exists for these dates?" \
                       + self.get_username_string()

            return 'Status: calculated ' + str(round(finish - start, 3)) + "s for " + aggregated_title \
                   + self.get_username_string()

    # Not very elegant, but the only way to prevent plots disappearing
    raise dash.exceptions.PreventUpdate("No data changed")
def get_cached_computation_analysis(self, **kwargs):
    """Fetches a computation output from a cache (typically Redis), or computes the analysis directly using
    another object, if requested. Typically, a computation is initiated and then that large analysis is cached,
    ready to be consumed by display components which repeatedly call this function.

    Parameters
    ----------
    kwargs
        Variables generated by GUI which relate to our computations (eg. start date, finish date, ticker etc.)

    Returns
    -------
    pd.DataFrame
    """
    try:
        force_calculate = kwargs['force_calculate']
    except:
        force_calculate = False

    key = None

    if 'key' in kwargs:
        key = kwargs['key']

    if 'test' not in kwargs:
        computation_type = self._tca_engine.get_engine_description()

        session_id = self._session_manager.get_session_id() + "_expiry_"
        session_id_computation = session_id + computation_type + '_'
    else:
        computation_type = ''
        session_id = ''
        session_id_computation = ''

    # Try to fetch some TCA analysis output from the cache
    cached_list = self._fetch_cached_list(force_calculate=force_calculate, computation_type=computation_type,
                                          session_id=session_id, key=key)

    # Otherwise force the calculation (or if it doesn't exist in the cache!)
    # when a button is pressed, typically force_calculate will be set to True
    if force_calculate:
        computation_request = self.create_computation_request(**kwargs)

        # Delete any existing keys for the current session
        self._glob_volatile_cache.clear_key_match("*" + session_id + "*")

        dict_of_df = self.run_computation_request(computation_request)

        dict_key_list = []
        dict_element_list = []

        # Cache all the dataframes in Redis or another memory space (will likely need them for later calls!)
        # From a security perspective it is probably better not to cache the TCAEngine objects on a database
        # (which can execute code)
        for dict_key in dict_of_df.keys():
            # Check the entry is actually filled (will be missing if for example there are no trades)
            if dict_of_df[dict_key] is None:
                raise Exception('Missing ' + dict_key)

            dict_key_list.append(session_id_computation + dict_key)
            dict_element_list.append(dict_of_df[dict_key])

        self._session_manager.set_session_flag('user_df', dict_key_list)

        # Put it back into the Redis cache (to be fetched by Dash callbacks)
        self._glob_volatile_cache.put(dict_key_list, dict_element_list)

        logger = LoggerManager.getLogger(__name__)
        logger.debug('Generated tables: ' + str(self._util_func.dict_key_list(dict_of_df.keys())))

        if key is None:
            return None

        if not isinstance(key, list):
            key = [key]

        for k in key:
            # Has one of the dataframes we want just been calculated? If so, return it!
            if k in dict_of_df.keys():
                cached_list.append(dict_of_df[k])
            # Otherwise look in Redis for the table for the user
            else:
                # As a last resort get it from our global cache; this key is unique to each user
                cached_list.append(self._glob_volatile_cache.get(session_id_computation + k))

    # Return as a tuple
    tup = list(cached_list)

    if len(tup) == 1:
        return tup[0]
    else:
        return tup
def read_dataframe_from_binary(self, fname, format=constants.binary_default_dump_format):
    """Reads a DataFrame in HDF5/Parquet file format which was previously written by tcapy

    Parameters
    ----------
    fname : str
        Path of binary file

    format : str (default: 'parquet')
        What is the binary format? ('parquet' or 'hdf5' are supported)

    Returns
    -------
    pd.DataFrame
    """
    logger = LoggerManager.getLogger(__name__)

    # Parquet is the default choice in tcapy
    if format == 'parquet':
        data_frame = None

        try:
            if not os.path.exists(fname):
                logger.error("Path doesn't exist for " + fname)

                return data_frame

            return pd.read_parquet(fname, engine=constants.parquet_engine)
        except Exception as e:
            logger.error("No valid data for " + fname + ': ' + str(e))

            return data_frame

    elif format == 'hdf5':
        # Needs pytables (imported as 'tables')
        data_frame = None
        store = None

        try:
            if not os.path.exists(fname):
                logger.error("Path doesn't exist for " + fname)

                return data_frame

            store = pd.HDFStore(fname)
            data_frame = store.select("data")
        except Exception as e:
            logger.error("No valid data for " + fname + ': ' + str(e))

            return data_frame
        finally:
            try:
                if store is not None:
                    store.close()
            except:
                pass

        return data_frame

    else:
        logger.warning("Cannot read file " + fname + ", invalid format specified")

    return None
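# Hypothetical round trip with the reader above, assuming the matching writer
# write_dataframe_to_binary (used elsewhere in this codebase) takes the same format flag;
# the import path and file path below are illustrative assumptions.
import pandas as pd

from tcapy.util.utilfunc import UtilFunc  # assumed import path

df = pd.DataFrame({'mid': [1.10, 1.11]}, index=pd.to_datetime(['2020-01-01', '2020-01-02']))

util_func = UtilFunc()
util_func.write_dataframe_to_binary(df, '/tmp/eurusd_sample.parquet', format='parquet')
df_back = util_func.read_dataframe_from_binary('/tmp/eurusd_sample.parquet', format='parquet')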
def calculate_benchmark(self, trade_order_df=None, market_df=None, trade_order_name=None, bid_benchmark=None,
                        ask_benchmark=None, benchmark_date_start_field=None, benchmark_date_end_field=None):
    if not self._check_calculate_benchmark(trade_order_name=trade_order_name):
        return trade_order_df, market_df

    # Calculate the time-weighted average price over each order's window, weighting each market data
    # point by how long it prevailed (for evenly spaced points this reduces to a simple average)

    if bid_benchmark is None:
        bid_benchmark = self._bid_benchmark

    if ask_benchmark is None:
        ask_benchmark = self._ask_benchmark

    if benchmark_date_start_field is None:
        benchmark_date_start_field = self._benchmark_date_start_field

    if benchmark_date_end_field is None:
        benchmark_date_end_field = self._benchmark_date_end_field

    if bid_benchmark in market_df.columns and ask_benchmark in market_df.columns:
        trade_order_df[self._benchmark_name] = np.nan

        date_start = trade_order_df[benchmark_date_start_field].values
        date_end = trade_order_df[benchmark_date_end_field].values

        date_start = np.searchsorted(market_df.index, date_start)
        date_end = np.searchsorted(market_df.index, date_end)

        bid_price = market_df[bid_benchmark].values
        ask_price = market_df[ask_benchmark].values

        dt = market_df.index.to_series().diff().values / np.timedelta64(1, 's')
        dt[0] = 0  # First point should be weighted zero (since we don't know how long it had been there)

        twap = []

        for i in range(0, len(trade_order_df.index)):
            if trade_order_df['side'][i] == 1:
                price = ask_price
            elif trade_order_df['side'][i] == -1:
                price = bid_price

            try:
                if date_start[i] == date_end[i]:
                    twap.append(price[date_start[i]])
                else:
                    twap_val = np.average(price[date_start[i]:date_end[i]],
                                          weights=dt[date_start[i]:date_end[i]])

                    twap.append(twap_val)
            except Exception as e:
                err_msg = "TWAP cannot be calculated, given market data does not fully overlap with trade data: " \
                          + str(e)

                LoggerManager.getLogger(__name__).error(err_msg)

                raise TradeMarketNonOverlapException(err_msg)

        trade_order_df[self._benchmark_name] = twap
    else:
        LoggerManager.getLogger(__name__).warning(
            bid_benchmark + " and " + ask_benchmark + " may not be in market data.")

    return trade_order_df, market_df
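# Minimal standalone sketch of the TWAP weighting used above: each price point is weighted by how
# long it remained the prevailing price (the time gap to the previous point), with the first gap
# set to zero.
import numpy as np
import pandas as pd

index = pd.to_datetime(['2020-01-01 10:00:00', '2020-01-01 10:00:01', '2020-01-01 10:00:04'])
price = np.array([1.10, 1.20, 1.30])

dt = index.to_series().diff().values / np.timedelta64(1, 's')
dt[0] = 0  # don't know how long the first point had prevailed

twap = np.average(price, weights=dt)  # (1.20 * 1 + 1.30 * 3) / 4 = 1.275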
def calculate_benchmark(self, trade_order_df=None, market_df=None, trade_order_name=None, bid_benchmark=None,
                        ask_benchmark=None, volume_field=None, benchmark_date_start_field=None,
                        benchmark_date_end_field=None):
    if not self._check_calculate_benchmark(trade_order_name=trade_order_name):
        return trade_order_df, market_df

    # If fields have not been specified, then take them from the field variables
    if bid_benchmark is None:
        bid_benchmark = self._bid_benchmark

    if ask_benchmark is None:
        ask_benchmark = self._ask_benchmark

    if volume_field is None:
        volume_field = self._volume_field

    if benchmark_date_start_field is None:
        benchmark_date_start_field = self._benchmark_date_start_field

    if benchmark_date_end_field is None:
        benchmark_date_end_field = self._benchmark_date_end_field

    if bid_benchmark in market_df.columns and ask_benchmark in market_df.columns \
            and volume_field in market_df.columns:
        trade_order_df[self._benchmark_name] = np.nan

        date_start = trade_order_df[benchmark_date_start_field].values
        date_end = trade_order_df[benchmark_date_end_field].values

        date_start = np.searchsorted(market_df.index, date_start)
        date_end = np.searchsorted(market_df.index, date_end)

        bid_price = market_df[bid_benchmark].values
        ask_price = market_df[ask_benchmark].values
        volume = market_df[volume_field].values

        vwap = []

        for i in range(0, len(trade_order_df.index)):
            if trade_order_df['side'][i] == 1:
                price = ask_price
            elif trade_order_df['side'][i] == -1:
                price = bid_price

            if date_start[i] == date_end[i]:
                vwap.append(price[date_start[i]])
            else:
                try:
                    vwap.append(np.average(price[date_start[i]:date_end[i]],
                                           weights=volume[date_start[i]:date_end[i]]))
                except Exception as e:
                    err_msg = "VWAP cannot be calculated, given market data does not fully overlap with trade data: " \
                              + str(e)

                    LoggerManager.getLogger(__name__).error(err_msg)

                    raise TradeMarketNonOverlapException(err_msg)

        trade_order_df[self._benchmark_name] = vwap
    else:
        LoggerManager.getLogger(__name__).warning(
            bid_benchmark + ", " + ask_benchmark + " and " + volume_field + " may not be in market data")

    return trade_order_df, market_df
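# Corresponding VWAP sketch: the slicing by np.searchsorted is identical to the TWAP case, but
# prices are weighted by traded volume over the window rather than by elapsed time.
import numpy as np

price = np.array([1.10, 1.20, 1.30])
volume = np.array([100.0, 300.0, 100.0])

vwap = np.average(price, weights=volume)  # (110 + 360 + 130) / 500 = 1.20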
def trim_sort_market_trade_order(self, market_trade_order_tuple, start_date, finish_date, ticker):
    """Takes market and trade/order data, then trims it so that the trade/order data is entirely within the
    start/finish date range of the market data. If trade/order data does not fully overlap with the market data,
    it can cause problems later when computing metrics/benchmarks.

    Parameters
    ----------
    market_trade_order_tuple : tuple
        Tuple of market data with trade/order data

    start_date : datetime
        Start date of TCA analysis

    finish_date : datetime
        Finish date of TCA analysis

    ticker : str
        Ticker

    Returns
    -------
    DataFrame, DataFrame (dict)
    """
    logger = LoggerManager.getLogger(__name__)

    market_df, trade_order_holder = self._convert_tuple_to_market_trade(market_trade_order_tuple)

    logger.debug("Filter the market data by start/finish date")

    # Check market data and trade data is not empty!
    market_df = self._time_series_ops.filter_start_finish_dataframe(market_df, start_date, finish_date)

    # When reassembling the market data, give the user the option of sorting it,
    # in case it was loaded in an odd order
    if market_df is not None and constants.re_sort_market_data_when_assembling:
        if not market_df.empty:
            logger.debug("Filtered by start/finish date, now sorting")

            market_df = market_df.sort_index()

    # Check if there's any market data: if we have none at all, then we can't do any TCA, so warn the user...
    if market_df is None or len(market_df.index) == 0:
        err_msg = "No market data between selected dates for " + ticker + " between " + str(start_date) \
                  + " - " + str(finish_date)

        logger.warning(err_msg)

    logger.debug("Combine trade/order data")

    # Combine all the trades into a single dataframe (and also the same for orders),
    # which are placed into a single dict
    trade_order_df_dict = trade_order_holder.get_combined_dataframe_dict()

    # Make sure the trade data is totally within the market data (if trade data is outside the market data,
    # we can't calculate any metrics later)
    for k in self._util_func.dict_key_list(trade_order_df_dict.keys()):
        trade_order_df_dict[k] = self.strip_trade_order_data_to_market(trade_order_df_dict[k], market_df)

    # Note, we can sometimes get empty results when running in parallel (eg. when split up into days, we may
    # not get data for a particular day, so don't raise an exception)
    if not trade_order_holder.check_empty_combined_dataframe_dict(trade_order_df_dict):
        err_msg = "No trade/order data between selected dates for " + ticker + " between " + str(start_date) \
                  + " - " + str(finish_date)

        logger.warning(err_msg)

    return market_df, trade_order_df_dict
def get_market_data(self, market_request):
    """Gets market data for a particular ticker. When we ask for non-standard FX crosses, only the mid field is
    returned (calculated as a cross rate). We do not give bid/ask quotes for calculated non-standard tickers,
    as these can be difficult to estimate.

    Parameters
    ----------
    market_request : MarketRequest
        The type of market data to get

    Returns
    -------
    DataFrame
    """
    logger = LoggerManager.getLogger(__name__)

    if isinstance(market_request, TCARequest):
        market_request = MarketRequest(market_request=market_request)

    old_ticker = market_request.ticker

    if market_request.asset_class == 'fx':
        # Check if we can get the ticker directly, or if we need to create a synthetic cross rate
        ticker = self._fx_conv.correct_notation(market_request.ticker)
    else:
        # If not FX, we don't have to invert
        ticker = old_ticker

    # Is the ticker, in the correct convention, among the crosses where we collect data?
    # (typically this will be the USD crosses, plus some liquid non-USD pairs like EURJPY)
    if isinstance(market_request.data_store, DatabaseSource):
        # TODO improve ticker check here!
        available_tickers = [ticker]
    elif 'csv' in market_request.data_store or 'h5' in market_request.data_store \
            or 'gzip' in market_request.data_store or 'parquet' in market_request.data_store \
            or isinstance(market_request.data_store, pd.DataFrame):

        # For CSV (or H5) we don't have much choice, and this could differ between CSV files
        # (if the CSV has a 'ticker' field, we will match on that)
        available_tickers = [ticker]
    elif market_request.data_store in constants.market_data_tickers:
        available_tickers = self._util_func.dict_key_list(
            constants.market_data_tickers[market_request.data_store].keys())
    else:
        err_msg = 'Ticker ' + str(ticker) + " doesn't seem available in the data source " \
                  + market_request.data_store

        logger.error(err_msg)

        raise Exception(err_msg)

    if ticker in available_tickers:
        # In the correct convention, or is not FX
        if ticker == old_ticker:
            market_df = self._get_correct_convention_market_data(market_request)
        # Otherwise need to flip to the correct convention (will only return 'mid')
        else:
            market_request_flipped = MarketRequest(market_request=market_request)
            market_request_flipped.ticker = ticker

            market_df = self._invert_quoting_market(
                self._get_correct_convention_market_data(market_request_flipped))

            if 'ticker' in market_df.columns:
                market_df['ticker'] = old_ticker
    else:
        if market_request.asset_class == 'fx' and market_request.instrument == 'spot':
            # Otherwise we need to get both legs
            # eg. for NZDCAD, we shall download NZDUSD and USDCAD => multiply them to get NZDCAD

            # Get the USD crosses for each leg and then multiply
            market_request_base = MarketRequest(market_request=market_request)
            market_request_terms = MarketRequest(market_request=market_request)

            market_request_base.ticker = old_ticker[0:3] + 'USD'
            market_request_terms.ticker = 'USD' + old_ticker[3:7]

            tickers_exist = self._fx_conv.currency_pair_in_list(
                self._fx_conv.correct_notation(market_request_base.ticker), available_tickers) and \
                self._fx_conv.currency_pair_in_list(
                    self._fx_conv.correct_notation(market_request_terms.ticker), available_tickers)

            # If both USD tickers don't exist, try computing via EUR tickers instead
            # (eg. USDSEK from EURUSD & EURSEK)
            if not tickers_exist:
                market_request_base.ticker = old_ticker[0:3] + 'EUR'
                market_request_terms.ticker = 'EUR' + old_ticker[3:7]

                tickers_exist = self._fx_conv.currency_pair_in_list(
                    self._fx_conv.correct_notation(market_request_base.ticker), available_tickers) and \
                    self._fx_conv.currency_pair_in_list(
                        self._fx_conv.correct_notation(market_request_terms.ticker), available_tickers)

            # Check if those currency pairs (in the CORRECT convention) are in the available tickers:
            # we will typically not collect market data for currencies in their wrong convention
            if tickers_exist:
                fields_try = ['bid', 'ask', 'mid']

                market_base_df = self.get_market_data(market_request_base)
                market_terms_df = self.get_market_data(market_request_terms)

                market_has_data = False

                if market_base_df is not None and market_terms_df is not None:
                    if not market_base_df.empty and not market_terms_df.empty:
                        market_has_data = True

                # If there's no data in either DataFrame, don't attempt to calculate anything
                if not market_has_data:
                    return pd.DataFrame()

                fields = []

                for f in fields_try:
                    if f in market_base_df.columns and f in market_terms_df.columns:
                        fields.append(f)

                # Only attempt to calculate if the fields exist
                if len(fields) > 0:
                    # Remove any other columns (eg. with ticker name etc.)
                    market_base_df = market_base_df[fields]
                    market_terms_df = market_terms_df[fields]

                    # Need to align the series to multiply them (and then fill down points which don't match):
                    # we can't use interpolation, given that would use FUTURE data
                    market_base_df, market_terms_df = market_base_df.align(market_terms_df, join="outer")

                    market_base_df = market_base_df.ffill()
                    market_terms_df = market_terms_df.ffill()

                    market_df = pd.DataFrame(data=market_base_df.values * market_terms_df.values,
                                             columns=fields, index=market_base_df.index)

                    # Values at the start of the series MIGHT be NaN, so need to drop those
                    market_df = market_df.dropna(subset=['mid'])

                    if 'ticker' in market_df.columns:
                        market_df['ticker'] = old_ticker
                else:
                    return None
            else:
                # Otherwise we couldn't compute it from either the USD legs or the EUR legs
                logger.warning("Couldn't find market data for ticker: " + str(ticker))

                return None
        else:
            # Otherwise couldn't find the non-FX ticker
            logger.warning("Couldn't find market data for ticker: " + str(ticker))

            return None

    return market_df
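# Standalone sketch of the synthetic cross-rate construction above, eg. NZDCAD from NZDUSD and
# USDCAD: align both legs on a common index, fill forward (never interpolate, which would leak
# future data), then multiply, dropping any leading NaNs.
import pandas as pd

nzdusd = pd.Series([0.60, 0.61], name='mid',
                   index=pd.to_datetime(['2020-01-01 10:00:00', '2020-01-01 10:00:02']))
usdcad = pd.Series([1.30, 1.31], name='mid',
                   index=pd.to_datetime(['2020-01-01 10:00:01', '2020-01-01 10:00:02']))

base, terms = nzdusd.align(usdcad, join='outer')

nzdcad = (base.ffill() * terms.ffill()).dropna()  # first point dropped (no USDCAD quote yet)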
def download_from_external_source(self, append_data=True, remove_duplicates=True, if_exists_table='append',
                                  if_exists_ticker='append', number_of_days=30 * 7, chunk_int_min=None,
                                  start_date=None, finish_date=None, delete_cached_files=False, tickers=None,
                                  write_temp_to_disk=True, write_to_disk_db=True, read_cached_from_disk=True,
                                  write_large_csv=False, write_large_hdf5_parquet=True,
                                  csv_folder=constants.csv_folder, csv_compression=None, return_df=False,
                                  web_proxies=constants.web_proxies):
    """Downloads market data from an external source and then dumps it to HDF5/Parquet files for temporary
    storage, which is cached. If HDF5/Parquet cached files already exist for a time segment, we read them in,
    saving us from making an external data call. Lastly, dumps it to an internal database.

    Parameters
    ----------
    append_data : bool
        True - only start collecting later data not already in database (ignoring number_of_days parameter)
        False - start collecting all data, ignoring anything stored in database

    remove_duplicates : bool
        True (default) - remove values which are repeated
        False - leave in repeated values

    if_exists_table : str
        'append' - if database table already exists append data to it
        'replace' - remove existing database table

    if_exists_ticker : str
        'append' - if ticker already exists in the database, append to it
        'replace' - replace any data for this ticker

    number_of_days : int
        Number of days to download data for

    chunk_int_min : int (None)
        Size of each download (default - specified in constants)

    Returns
    -------
    """
    logger = LoggerManager.getLogger(__name__)

    if write_to_disk_db:
        data_source_local = self._get_output_data_source()

    if write_large_csv:
        if not os.path.isdir(csv_folder):
            logger.warning("CSV folder " + csv_folder + " where we are about to write does not exist")

    # What chunk size in minutes do we want for this data provider?
    if chunk_int_min is None:
        chunk_int_min = self._get_download_chunk_min_size()

    if chunk_int_min is None:
        chunk_size_str = None
    else:
        chunk_size_str = str(chunk_int_min) + "min"

    if tickers is None:
        tickers = self._get_tickers()

    if isinstance(tickers, str):
        tickers = [tickers]

    # If there's no start or finish date, choose default start/finish dates
    if start_date is None and finish_date is None:
        finish_date = datetime.datetime.utcnow()
        finish_date = datetime.datetime(finish_date.year, finish_date.month, finish_date.day, 0, 0, 0, 0)

        start_date = finish_date - timedelta(days=number_of_days)
    else:
        start_date = self.time_series_ops.date_parse(start_date)
        finish_date = self.time_series_ops.date_parse(finish_date)

        if finish_date < start_date:
            logger.error("Download finish date is before start date!")

            return

    now = pd.Timestamp(datetime.datetime.utcnow(), tz='utc')

    # Do not allow downloading of future data!
    if finish_date > now:
        finish_date = now

    df_dict = {}
    msg_list = []

    # Loop through each ticker
    for ticker in tickers:
        has_old = False

        if delete_cached_files and write_to_disk_db:
            logger.info("Deleting all cached temp files for " + ticker)

            for name in glob.glob(self.temp_data_folder + '/*' + ticker + "*"):
                try:
                    os.remove(name)
                except:
                    logger.warning("Couldn't delete file " + name)

            logger.info("Finished deleting cached files for " + ticker)

        # If we have been asked to append data, load up what we can from the internal database
        # and find the last point
        if append_data and if_exists_ticker == 'append' and write_to_disk_db:
            logger.info("Trying to download old data first for " + ticker)

            try:
                df_old = data_source_local.fetch_market_data(start_date, finish_date, ticker,
                                                             web_proxies=web_proxies)

                # This will vary between tickers (in particular if we happen to add a new ticker)
                start_date = df_old.index[-1]

                has_old = True

                # Remove reference - big file!
                df_old = None
            except Exception as e:
                logger.info("No data found for ticker " + ticker + " with error: " + str(e))
        else:
            logger.info("Downloading new data for " + ticker + ".")

        # Date range may not work with timezones
        start_date = pd.Timestamp(start_date.replace(tzinfo=None))
        finish_date = pd.Timestamp(finish_date.replace(tzinfo=None))

        if finish_date - start_date < pd.Timedelta(days=1):
            start_date_list = [start_date, finish_date]
        else:
            # Download from that last point to the present day
            start_date_list = pd.date_range(start_date, finish_date)

            start_date_list = [pd.Timestamp(x.to_pydatetime()) for x in start_date_list]

            if finish_date > start_date_list[-1]:
                start_date_list.append(finish_date)

        df = None

        # Create downloads in x minute chunks (if we request very large chunks of data with certain data
        # providers, we could cause problems!)
        if df is None:
            df_remote_list = []

            # Loop by day (otherwise we can end up with too many open files!)
            for i in range(0, len(start_date_list) - 1):
                if chunk_size_str is not None:
                    if start_date_list[i + 1] - start_date_list[i] < pd.Timedelta(minutes=chunk_int_min):
                        start_date_hist = [start_date_list[i]]
                        finish_date_hist = [start_date_list[i + 1]]
                    else:
                        start_date_hist, finish_date_hist = UtilFunc().split_into_freq(
                            start_date_list[i], start_date_list[i + 1], freq=chunk_size_str)
                else:
                    start_date_hist = [start_date_list[i]]
                    finish_date_hist = [start_date_list[i + 1]]

                # For FX and most other markets we should remove weekends (cryptocurrencies do have weekend data)
                if self._remove_weekend_points():
                    start_date_hist, finish_date_hist = UtilFunc().remove_weekend_points(start_date_hist,
                                                                                         finish_date_hist)

                output = []

                if constants.use_multithreading:
                    # Create a multiprocess object for downloading data
                    swim = Swim(parallel_library=constants.database_populator_threading_library)

                    pool = swim.create_pool(thread_no=self._get_threads())

                    result = []

                    for i in range(0, len(start_date_hist)):
                        result.append(
                            pool.apply_async(self._fetch_market_data,
                                             args=(start_date_hist[i], finish_date_hist[i], ticker,
                                                   write_temp_to_disk, read_cached_from_disk, web_proxies)))

                    output = [p.get() for p in result]

                    swim.close_pool(pool, True)
                else:
                    # Otherwise run in a single threaded fashion
                    for i in range(0, len(start_date_hist)):
                        output.append(
                            self._fetch_market_data(start_date_hist[i], finish_date_hist[i], ticker,
                                                    write_to_disk=write_temp_to_disk,
                                                    read_cached_from_disk=read_cached_from_disk,
                                                    web_proxies=web_proxies))

                # Get all the dataframe chunks and returned messages
                df_list = [self._remove_duplicates_time_series(x, remove_duplicates, field='mid')
                           for x, y in output if x is not None]

                msg_list.extend([y for x, y in output if x is not None and y is not None])

                # Concatenate all the (eg. 5 minute) data chunks
                try:
                    if df_list != []:
                        df_temp = pd.concat(df_list)

                        if df_temp is not None:
                            if not df_temp.empty:
                                df_remote_list.append(df_temp)
                except Exception as e:
                    logger.error(str(e))

            if df_remote_list != []:
                df = pd.concat(df_remote_list)

                # Need to sort data (database assumes sorted data for chunking/searches)
                df = df.sort_index()
                df = self.time_series_ops.localize_as_UTC(df)

            if write_large_hdf5_parquet:
                if df is not None:
                    if not df.empty:
                        key = '_' + self._get_postfix() + "_" + \
                              (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')

                        filename = os.path.join(csv_folder, ticker + key) + '.' + fileformat

                        # Temporary cache for testing purposes (also if the process crashes, we can read
                        # this back in)
                        UtilFunc().write_dataframe_to_binary(df, filename, format=binary_format)

        if df is not None:
            # Assume UTC time (don't want to mix UTC and non-UTC in database!)
            df = self.time_series_ops.localize_as_UTC(df)

        # Write CSV
        if write_large_csv:
            if df is not None:
                if not df.empty:
                    key = '_' + self._get_postfix() + "_" + \
                          (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')

                    if csv_compression == 'gzip':
                        df.to_csv(os.path.join(csv_folder, ticker + key + ".csv.gz"), compression='gzip')
                    else:
                        df.to_csv(os.path.join(csv_folder, ticker + key + ".csv"))

        if return_df:
            df_dict[ticker] = df

        # Dump what we have locally (or to whatever DatabaseSource we have defined)
        try:
            start_date = start_date.replace(tzinfo=pytz.utc)

            # Remove the first point if it matches the last point already in the database
            if has_old:
                if df.index[0] == start_date:
                    df = df[1:]

            if df is not None:
                df = df.sort_index()

                df = self._remove_duplicates_time_series(df, remove_duplicates, field='mid')

            if write_to_disk_db and df is not None:
                data_source_local.append_market_data(df, ticker, if_exists_table=if_exists_table,
                                                     if_exists_ticker=if_exists_ticker)

                logger.info("Wrote to database for " + ticker)
        except Exception as e:
            final_err = "Data was missing for these dates " + str(start_date) + " - " + str(finish_date) \
                        + " for " + str(tickers) \
                        + " Didn't write anything to disk or return any valid dataframe: " + str(e)

            logger.error(final_err)

        if df is None:
            msg_list.append("No downloaded data for " + str(start_date) + " - " + str(finish_date)
                            + ". Is this a holiday?")

    # Returns a status containing any failed downloads, which can be read by a user
    return msg_list, df_dict
# See the License for the specific language governing permissions and limitations under the License. # import os from tcapy.conf.constants import Constants from tcapy.util.loggermanager import LoggerManager from tcapy.data.databasesource import DatabaseSourceCSV from tcapy.data.databasesource import DatabaseSourceArctic, DatabaseSourcePyStore, DatabaseSourceInfluxDB, DatabaseSourceKDB constants = Constants() if __name__ == '__main__': logger = LoggerManager.getLogger(__name__) PLOT_BACK_DATA = False data_vendor = 'ncfx' # 'dukascopy' or 'ncfx' # Either use 'arctic' or 'pystore' or 'influxdb' or 'kdb' to store market tick data market_data_store = 'arctic' logger.info("About to upload data to " + market_data_store) ## YOU WILL NEED TO CHANGE THE BELOW LINES ######################################################################### # Parameters for testing if True: data_vendor = 'testharness'
def calculate_benchmark(self, trade_order_df=None, market_df=None, trade_order_name=None, bid_benchmark=None,
                        ask_benchmark=None, weighting_field=None, benchmark_date_start_field=None,
                        benchmark_date_end_field=None, start_time_before_offset=None,
                        finish_time_after_offset=None, overwrite_time_of_day=None, overwrite_timezone=None):
    if self._check_empty_benchmark_market_trade_data(trade_order_name, trade_order_df, market_df):
        return trade_order_df, market_df

    # If fields have not been specified, then take them from the field variables
    if bid_benchmark is None:
        bid_benchmark = self._bid_benchmark

    if ask_benchmark is None:
        ask_benchmark = self._ask_benchmark

    if weighting_field is None:
        weighting_field = self._weighting_field

    if benchmark_date_start_field is None:
        benchmark_date_start_field = self._benchmark_date_start_field

    if benchmark_date_end_field is None:
        benchmark_date_end_field = self._benchmark_date_end_field

    if start_time_before_offset is None:
        start_time_before_offset = self._start_time_before_offset

    if finish_time_after_offset is None:
        finish_time_after_offset = self._finish_time_after_offset

    if overwrite_time_of_day is None:
        overwrite_time_of_day = self._overwrite_time_of_day

    if overwrite_timezone is None:
        overwrite_timezone = self._overwrite_timezone

    # If no weighting field is needed, we don't have to check for it in the market data
    if weighting_field is None:
        weighting_field_condition = True
    else:
        weighting_field_condition = weighting_field in market_df.columns

    if bid_benchmark in market_df.columns and ask_benchmark in market_df.columns and weighting_field_condition:
        trade_order_df[self._benchmark_name] = np.nan

        if benchmark_date_start_field is not None and benchmark_date_end_field is not None and \
                benchmark_date_start_field in trade_order_df.columns and \
                benchmark_date_end_field in trade_order_df.columns:
            date_start = trade_order_df[benchmark_date_start_field].values
            date_end = trade_order_df[benchmark_date_end_field].values
        else:
            date_start = trade_order_df.index.values
            date_end = trade_order_df.index.values

        # Overwrite every trade/order start/end time with a specific time of day, if this has been specified
        if overwrite_time_of_day is not None and overwrite_timezone is not None:
            date_start = self._time_series_ops.overwrite_time_of_day_in_datetimeindex(
                date_start, overwrite_time_of_day, old_tz=trade_order_df.index.tz,
                overwrite_timezone=overwrite_timezone)
            date_end = self._time_series_ops.overwrite_time_of_day_in_datetimeindex(
                date_end, overwrite_time_of_day, old_tz=trade_order_df.index.tz,
                overwrite_timezone=overwrite_timezone)

        # Subtract a user defined time from the start time of the order (or point in time for a trade)
        # if specified
        if start_time_before_offset is not None:
            date_start = date_start - self._time_series_ops.get_time_delta(start_time_before_offset)

        # Add a user defined time to the finish time of the order (or point in time for a trade) if specified
        if finish_time_after_offset is not None:
            date_end = date_end + self._time_series_ops.get_time_delta(finish_time_after_offset)

        date_start = np.searchsorted(market_df.index, date_start)
        date_end = np.searchsorted(market_df.index, date_end)

        bid_price = market_df[bid_benchmark].values
        ask_price = market_df[ask_benchmark].values

        try:
            trade_order_df[self._benchmark_name] = \
                self._benchmark_calculation(trade_order_df, bid_price, ask_price, date_start, date_end,
                                            weights=self._generate_weights(market_df,
                                                                           weighting_field=weighting_field))
        except:
            LoggerManager.getLogger(__name__).warning(
                self._benchmark_name + " not calculated (check it has the correct input fields)")
    else:
        LoggerManager.getLogger(__name__).warning(
            bid_benchmark + ", " + ask_benchmark + " and " + str(weighting_field)
            + " may not be in market data")

    return trade_order_df, market_df
def _fetch_market_data(self, start, finish, ticker, write_to_disk=True, read_cached_from_disk=True,
                       web_proxies=constants.web_proxies):
    logger = LoggerManager.getLogger(__name__)

    key = (str(start) + str(finish) + ticker + '_' + self._get_postfix()).replace(":", '_')
    filename = os.path.join(self.temp_data_folder, key) + '.' + fileformat

    util_func = UtilFunc()

    start_time_stamp = pd.Timestamp(start)
    finish_time_stamp = pd.Timestamp(finish)

    if self._remove_saturday():
        weekend_data = "Saturday? " + key

        # Ignore Saturday, and don't attempt to download (dayofweek: Monday = 0 ... Sunday = 6)
        if start_time_stamp.dayofweek == 5 or finish_time_stamp.dayofweek == 5:
            return None, weekend_data

    if self._remove_weekend_points():
        weekend_data = "Weekend? " + key

        # Ignore Sunday before the market open, and Friday after the market close
        if start_time_stamp.dayofweek == 6 and start_time_stamp.hour < 20:
            return None, weekend_data

        if start_time_stamp.dayofweek == 4 and start_time_stamp.hour > 22:
            return None, weekend_data

    df = None

    if read_cached_from_disk:
        if os.path.exists(filename):
            df = util_func.read_dataframe_from_binary(filename, format=binary_format)

            if df is not None:
                logger.debug("Read " + filename + " from disk")

    if df is None:
        # Convert the tcapy ticker into the vendor's ticker
        df = self._get_input_data_source().fetch_market_data(start, finish,
                                                             ticker=self._get_tickers_vendor()[ticker],
                                                             web_proxies=web_proxies)

        if df is not None:
            df = df.drop('ticker', axis=1)

            if write_to_disk:
                # Write a small temporary dataframe to disk (if the process fails later, these can be picked up,
                # without having to call the external vendor again)
                util_func.write_dataframe_to_binary(df, filename, format=binary_format)

    msg = None

    if df is None:
        msg = "No data? " + key

    return df, msg
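# Standalone sketch of the weekend filter above: FX has no Saturday data, the market opens late on
# Sunday and closes late on Friday. Times are in UTC and the open/close hours mirror the hardcoded
# values above (pandas dayofweek: Monday = 0 ... Sunday = 6).
import pandas as pd

def is_tradable_point_sketch(ts, sunday_open_utc_hour=20, friday_close_utc_hour=22):
    if ts.dayofweek == 5:                                      # Saturday
        return False
    if ts.dayofweek == 6 and ts.hour < sunday_open_utc_hour:   # Sunday before the open
        return False
    if ts.dayofweek == 4 and ts.hour > friday_close_utc_hour:  # Friday after the close
        return False

    return True

is_tradable_point_sketch(pd.Timestamp('2020-01-04 12:00'))  # a Saturday -> False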
def _parallel_get_market_trade_metrics(self, tca_request_list, dummy_market):
    logger = LoggerManager.getLogger(__name__)

    market_holder_list = DataFrameHolder()
    trade_order_holder_list = DataFrameHolder()

    # For each currency pair, collect the trades and market data, then calculate benchmarks and slippage
    result = []

    keep_looping = True

    # If we have also asked for trades/orders
    if tca_request_list[0].trade_order_mapping is not None:
        point_in_time_executions_only = \
            self._util_func.dict_key_list(tca_request_list[0].trade_order_mapping) == ['trade_df']
    else:
        point_in_time_executions_only = True

    parallel_library = tca_request_list[0].multithreading_params['parallel_library']

    if parallel_library == 'single':
        tca_ticker_loader = Mediator.get_tca_ticker_loader(version=self._version)

    start_date = tca_request_list[0].start_date
    finish_date = tca_request_list[0].finish_date

    # Parameters for the loop
    i = 0
    no_of_tries = 5

    # Error trapping for Celery: if an event has failed, retry it
    while i < no_of_tries and keep_looping:

        try:
            # For each TCA request kick off a thread
            for tca_request_single_ticker in tca_request_list:

                # Split up the request by date (monthly/weekly chunks)
                tca_request_date_split = self._split_tca_request_by_date(
                    tca_request_single_ticker, tca_request_single_ticker.ticker,
                    period=tca_request_single_ticker.multithreading_params['cache_period'])

                if not constants.multithreading_params['splice_request_by_dates'] \
                        or tca_request_list[0].tca_type == 'detailed' \
                        or tca_request_list[0].tca_type == 'compliance' \
                        or tca_request_list[0].summary_display == 'candlestick' \
                        or not point_in_time_executions_only:

                    if 'celery' in parallel_library:
                        # Load all the data for this ticker and THEN calculate the metrics on it
                        result.append(chord(
                            (get_market_trade_holder_via_celery.s(tca_request_data)
                             for tca_request_data in tca_request_date_split),
                            calculate_metrics_single_ticker_via_celery.s(
                                tca_request_single_ticker, dummy_market)).apply_async())

                    elif parallel_library == 'single':
                        # This is not actually parallel, but is mainly for debugging purposes
                        for tca_request_s in tca_request_date_split:

                            market_df, trade_order_df_dict = \
                                tca_ticker_loader.get_market_trade_order_holder(
                                    tca_request_s, return_cache_handles=False)

                            market_df, trade_order_df_list, ticker, trade_order_keys = \
                                tca_ticker_loader.calculate_metrics_single_ticker(
                                    (market_df, trade_order_df_dict), tca_request_s, dummy_market)

                            market_holder_list.add_dataframe(market_df, ticker)

                            trade_order_holder_list.add_dataframe_dict(
                                dict(zip(trade_order_keys, trade_order_df_list)))
                else:
                    # Otherwise work on parallel chunks by date
                    # This doesn't currently work with orders which straddle day/week/month boundaries,
                    # but should work with points in time
                    #
                    # In practice, it's not really much faster than the above code
                    if 'celery' == parallel_library:

                        # For each ticker/date combination load data and process the chunk
                        # (so it can be done fully in parallel)
                        result.append(group(
                            get_market_trade_holder_and_calculate_metrics_single_ticker_via_celery.s(
                                tca_request_data, dummy_market)
                            for tca_request_data in tca_request_date_split).apply_async())

            # Now combine the results from the parallel operations, if using celery
            if 'celery' in parallel_library:

                # Careful, when the output is empty!
                output = [p.get(timeout=constants.celery_timeout_seconds) for p in result if p is not None]

                # If pipelined/splice_request_by_dates we will have two lists, so flatten it into one
                output = self._util_func.flatten_list_of_lists(output)

                for market_df, trade_order_df_list, ticker, trade_order_keys in output:
                    market_holder_list.add_dataframe(market_df, ticker)

                    trade_order_holder_list.add_dataframe_dict(dict(zip(trade_order_keys, trade_order_df_list)))

                del result
                del output

            keep_looping = False

        except DateException as e:
            raise e

        except TradeMarketNonOverlapException as e:
            raise e

        except DataMissingException as e:
            raise e

        except ErrorWritingOverlapDataException as e:
            raise e

        # Exception likely related to Celery and possibly lack of communication with the Redis message broker
        # or the Memcached results backend
        except Exception as e:
            if i == no_of_tries - 1:
                err_msg = "Failed with " + parallel_library + " after multiple attempts: " + str(e) \
                          + ", " + str(traceback.format_exc())

                raise Exception(err_msg)

            i = i + 1

            logger.warning("Failed with " + parallel_library + ", trying again (attempt " + str(i) + "): "
                           + str(e) + ", " + str(traceback.format_exc()))

    logger.debug("Finished parallel computation")

    # Expand out the DataFrame holders into dictionaries of DataFrames
    market_df_dict = market_holder_list.get_combined_dataframe_dict()
    trade_order_results_df_dict = trade_order_holder_list.get_combined_dataframe_dict(
        start_date=start_date, finish_date=finish_date)

    # TODO add candlestick drawing here for cases when using split threading by date
    trade_order_results_df_dict = self._util_func.remove_keymatch_dict(trade_order_results_df_dict,
                                                                       'market_df_downsampled')

    return market_df_dict, trade_order_results_df_dict
def _apply_summary_metrics(self, tca_request_list, trade_order_results_df_dict, market_df_dict): trade_order_list = self._util_func.dict_key_list( trade_order_results_df_dict.keys()) market_list = self._util_func.dict_key_list(market_df_dict.keys()) if not (isinstance(trade_order_list, list)): trade_order_list = [trade_order_list] if not (isinstance(market_list, list)): market_list = [market_list] # First get the market data (for doing bid/ask on distributions) - only does the first ticker! market_df = market_df_dict[tca_request_list[0].ticker] logger = LoggerManager.getLogger(__name__) logger.debug("Constructing results form to summarize analysis...") # Calculate user specified aggregate result forms (eg. timelines, distribution etc.) for each trade/order # which has been selected results_form = tca_request_list[0].results_form join_tables = tca_request_list[0].join_tables # If dummy market (ie. don't return market data to the user) has been specified then market data cannot # be included in ResultsForm calculations if results_form is not None: # Go through all the trade/orders doing statistical aggregations for i in range(0, len(trade_order_results_df_dict)): # Ignore 'fig' objects which are Plotly JSON Figures, and only process DataFrames if 'df' in trade_order_list[i]: for r in results_form: # Filter the trades for the event type which has been requested (eg. 'trade' or 'placement') trade_order_df = self._trade_order_tag.filter_trade_order( trade_order_df=trade_order_results_df_dict[ trade_order_list[i]], tag_value_combinations={ 'event_type': tca_request_list[0].event_type }) # Calculate aggregate ResultForm results = r.aggregate_results( market_trade_order_df=trade_order_df, market_df=market_df, market_trade_order_name=trade_order_list[i]) if results[0] is not None: for results_form_df, results_form_name in results: trade_order_results_df_dict[ results_form_name] = results_form_df # Go through all the market data doing statistical aggregations for i in range(0, len(market_df_dict)): # Ignore 'fig' objects which are Plotly JSON Figures, and only process DataFrames which are not empty if 'fig' not in market_list[i] and market_df_dict[ market_list[i]] is not None: if not (market_df_dict[market_list[i]].empty): for r in results_form: # Calculate aggregate ResultForm results = r.aggregate_results( market_trade_order_df=market_df_dict[ market_list[i]], market_df=market_df_dict[market_list[i]], market_trade_order_name=market_list[i]) if results[0] is not None: for results_form_df, results_form_name in results: trade_order_results_df_dict[ results_form_name] = results_form_df logger.debug("Now join table results...") # As a final stage, join together any tables which have been specified by the user # for example: does the user want to combine certain metrics or trades together? if join_tables is not None: for j in join_tables: results = j.aggregate_tables( df_dict=trade_order_results_df_dict) if results != []: if results[0] is not None: for results_form_df, results_form_name in results: trade_order_results_df_dict[ results_form_name] = results_form_df logger.debug( "Finished calculating results form and join table results!") return trade_order_results_df_dict
def _combine_mini_df_from_disk_single_thread(self, ticker, remove_duplicates=True):

    logger = LoggerManager.getLogger(__name__)
    time_series_ops = TimeSeriesOps()

    logger.info('Getting ' + ticker + ' filenames...')

    temp_data_folder = self.temp_data_folder

    filename_list = []

    for root, dirnames, filenames in os.walk(temp_data_folder):

        for filename in filenames:
            if ticker in filename and '.' + fileformat in filename:
                filename_h5_parquet = os.path.join(root, filename)

                # If the file is less than 10MB add it (otherwise it is likely to be a very large
                # aggregated file, not a mini file!)
                if os.path.getsize(filename_h5_parquet) < 10 * 1024 * 1024:
                    filename_list.append(filename_h5_parquet)

    df_list = []

    util_func = UtilFunc()

    logger.info('Loading ' + ticker + ' mini dataframe into memory')

    i = 0

    if len(filename_list) == 0:
        logger.warning("Looks like there are no files for " + ticker + " in " + temp_data_folder
                       + ". Are you sure the path is correct?")

    # Go through each mini file, which represents a few minutes of data, and append it
    for filename in filename_list:
        filesize = 0

        try:
            filesize = os.path.getsize(filename) / 1024.0

            df = util_func.read_dataframe_from_binary(filename, format=binary_format)

            i = i + 1

            # Every 100 files, print reading progress
            if i % 100 == 0:
                logger.info('Reading ' + filename + ' number ' + str(i))

            if df is not None:
                df = df.sort_index()
                df = self._remove_duplicates_time_series(df, remove_duplicates, time_series_ops,
                                                         field='mid')

                df_list.append(df)
        except Exception as e:
            logger.warning('Failed to parse ' + filename + " of " + str(filesize) + "KB: " + str(e))

    if df_list == []:
        logger.warning('No dataframe read for ' + ticker + ', cannot combine!')

        return

    logger.info('About to combine ' + ticker + ' into large dataframe to write to disk...')

    df = pd.concat(df_list)

    # Assume UTC time (don't want to mix UTC and non-UTC in the database!)
    df = time_series_ops.localize_as_UTC(df)
    df = df.sort_index()

    df = self._remove_duplicates_time_series(df, remove_duplicates, time_series_ops, field='mid')

    postfix = '-' + self._get_postfix() + '-with-duplicates'

    if remove_duplicates:
        postfix = '-' + self._get_postfix() + '-no-duplicates'

    filename = os.path.join(self.temp_large_data_folder, ticker + postfix) + '.' + fileformat

    util_func.write_dataframe_to_binary(df, filename, format=binary_format)
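# _remove_duplicates_time_series (called above) thins out ticks where the 'mid' field has not
# changed, which materially shrinks the combined DataFrame before it is written to disk. A rough
# pandas sketch of that idea follows, under the assumption that "duplicate" means consecutive
# unchanged values on a sorted index (the real implementation may differ):

import pandas as pd


def drop_consecutive_duplicates(df, field='mid'):
    """Keep only rows where `field` changes versus the previous row (the first row is always kept)."""
    df = df.sort_index()

    # diff() is NaN on the first row, and NaN != 0 evaluates to True, so the first row is kept
    changed = df[field].diff() != 0

    return df[changed]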
def callback(*args):
    """Kicks off fetching of market data and TCA calculations for a specific currency pair. Caches the
    data in a VolatileCache instance, ready to be read in by the other charts.

    Parameters
    ----------
    *args
        Contents depend on tca_type, but typically include the below

    ticker_val : str
        Ticker to be used in TCA calculations

    start_date_val : str
        Start date of TCA analysis

    start_time_val : str
        Start time of TCA analysis

    finish_date_val : str
        Finish date of TCA analysis

    finish_time_val : str
        Finish time of TCA analysis

    venue_val : str
        Venue data to be used

    n_clicks : int
        Number of clicks

    Returns
    -------
    str
    """
    start = time.time()

    tag = tca_type + '-calculation-button'

    old_clicks = self._session_manager.get_session_clicks(tag)

    # Make sure none of the other charts/links are plotted till we have completed this!
    self._session_manager.set_session_flag(
        [self._plot_flags['aggregated'], self._plot_flags['detailed'], self._plot_flags['compliance']],
        False)

    logger = LoggerManager.getLogger(__name__)

    if tca_type == 'detailed':
        ticker_val, start_date_val, start_time_val, finish_date_val, finish_time_val, \
            broker_val, algo_val, venue_val, market_data_val, metric_val, n_clicks = args

        # Catch cases where users repeatedly click, which can cause misalignment in clicks
        self._session_manager.set_session_clicks(tag, n_clicks, old_clicks=old_clicks)

        logger.debug(self.create_generate_button_msg(old_clicks, n_clicks))

        # Make sure all the parameters have been selected
        if ticker_val != '' and venue_val != '' and start_date_val != '' and start_time_val != '' and \
                finish_date_val != '' and finish_time_val != '' and market_data_val != '' and broker_val != '' and \
                algo_val != '' and n_clicks > old_clicks:

            # Expand tickers/broker fields etc, in case for example 'All' has been specified or any other groups
            broker_val = self._util_func.populate_field(
                broker_val, constants.available_brokers_dictionary, exception_fields='All')
            algo_val = self._util_func.populate_field(
                algo_val, constants.available_algos_dictionary, exception_fields='All')
            venue_val = self._util_func.populate_field(
                venue_val, constants.available_venues_dictionary, exception_fields='All')

            # Combine the start date/time and finish date/time
            start_date_val = start_date_val + ' ' + start_time_val
            finish_date_val = finish_date_val + ' ' + finish_time_val

            metric_val = metric_val.replace(' ', '_')

            logger.debug('Calculation click old: ' + str(old_clicks) + " clicks vs new " + str(n_clicks))

            self._session_manager.set_session_clicks(tag, n_clicks)
            self._session_manager.set_session_flag('metric', value=metric_val)
            self._session_manager.set_session_flag('detailed-visualization', value=True)

            logger.info('Selected ' + ticker_val + " " + start_date_val + " - " + finish_date_val)

            # Check that the dates are no more than constants.max_plot_days apart
            if pd.Timestamp(finish_date_val) - pd.Timestamp(start_date_val) > \
                    pd.Timedelta(days=constants.max_plot_days):
                return "Status: Cannot plot more than " + str(constants.max_plot_days) + " days!"
            elif pd.Timestamp(start_date_val) >= pd.Timestamp(finish_date_val):
                return "Status: Start date must be before the end date"

            try:
                # Clear the cache for the current user
                self._glob_volatile_cache.clear_key_match(self._session_manager.get_session_id())

                results_form = [
                    # Calculate the distribution of the metric for trades/orders, broken down by
                    # trade side (buy/sell)
                    DistResultsForm(
                        trade_order_list=['trade_df', 'order_df'],
                        metric_name=metric_val,
                        aggregate_by_field='side',
                        scalar=10000.0,
                        weighting_field='executed_notional_in_reporting_currency'),

                    # Create a table of the markout of every trade
                    TableResultsForm(
                        trade_order_list=['trade_df'],
                        metric_name='markout',
                        filter_by='all',
                        replace_text={'markout_': '',
                                      'executed_notional': 'exec not',
                                      'notional_currency': 'exec not cur'},
                        keep_fields=['executed_notional', 'side', 'notional_currency'],
                        scalar={'all': 10000.0, 'exclude': ['executed_notional', 'side']},
                        round_figures_by={'all': 2, 'executed_notional': 0, 'side': 0},
                        weighting_field='executed_notional')
                ]

                benchmark_calcs = [
                    # Calculate the arrival prices for every trade/order
                    BenchmarkArrival(trade_order_list=['trade_df', 'order_df']),

                    # Calculate the VWAP for each order
                    BenchmarkVWAP(trade_order_list=['order_df']),

                    # Calculate the TWAP for each order
                    BenchmarkTWAP(trade_order_list=['order_df'])
                ]

                metric_calcs = [
                    metric_val,
                    MetricMarkout(trade_order_list=['trade_df'])
                ]

                # Get from the cache; note, given that we are in the first part of the chain, we
                # should force it to calculate!
                sparse_market_trade_df = self.get_cached_computation_analysis(
                    key='sparse_market_trade_df',
                    start_date=start_date_val, finish_date=finish_date_val,
                    ticker=ticker_val, venue=venue_val,
                    market_data=market_data_val, event_type='trade',
                    dummy_market=False,
                    broker=broker_val, algo=algo_val,
                    metric_calcs=metric_calcs,
                    metric_trade_order_list=['trade_df', 'order_df'],
                    benchmark_calcs=benchmark_calcs,
                    tca_type='detailed',
                    tca_engine=self._tca_engine,
                    results_form=results_form,
                    force_calculate=True)

                calc_start = sparse_market_trade_df.index[0]
                calc_end = sparse_market_trade_df.index[-1]

                detailed_title = self.create_status_msg_flags('detailed', ticker_val, calc_start, calc_end)

            except Exception as e:
                LoggerManager().getLogger(__name__).exception(e)

                return "Status: error " + str(e) + ". Check dates?"
            finish = time.time()

            return 'Status: calculated ' + str(round(finish - start, 3)) + "s for " + detailed_title

    elif tca_type == 'aggregated':
        ticker_val, start_date_val, finish_date_val, broker_val, algo_val, venue_val, reload_val, market_data_val, \
            event_type_val, metric_val, n_clicks = args

        # Catch cases where users repeatedly click, which can cause misalignment in clicks
        self._session_manager.set_session_clicks(tag, n_clicks, old_clicks=old_clicks)

        logger.debug(self.create_generate_button_msg(old_clicks, n_clicks))

        if ticker_val != '' and start_date_val != '' and venue_val != '' \
                and finish_date_val != '' and reload_val != '' and event_type_val != '' and metric_val != '' and \
                n_clicks > old_clicks:

            # Expand tickers/broker fields etc, in case for example 'All' has been specified or any other groups
            ticker_val_list = self._util_func.populate_field(ticker_val, constants.available_tickers_dictionary)
            broker_val_list = self._util_func.populate_field(broker_val, constants.available_brokers_dictionary)
            algo_val_list = self._util_func.populate_field(algo_val, constants.available_algos_dictionary)
            venue_val_list = self._util_func.populate_field(venue_val, constants.available_venues_dictionary)

            metric_val = metric_val.replace(' ', '_')

            logger.debug('Calculation click old: ' + str(old_clicks) + " clicks vs new " + str(n_clicks))

            self._session_manager.set_session_clicks(tag, n_clicks)
            self._session_manager.set_session_flag('metric', value=metric_val)
            self._session_manager.set_session_flag('aggregated-visualization', True)

            try:
                # Clear the cache for the current user
                self._glob_volatile_cache.clear_key_match(self._session_manager.get_session_id())

                results_form = [
                    # Show the distribution of the selected metric for trades weighted by notional,
                    # aggregated by ticker and then by venue
                    DistResultsForm(
                        trade_order_list=['trade_df'],
                        metric_name=metric_val,
                        aggregate_by_field=['ticker', 'venue'],
                        weighting_field='executed_notional_in_reporting_currency'),

                    # Display the timeline of the metric averaged by day (and weighted by notional)
                    TimelineResultsForm(
                        trade_order_list=['trade_df'],
                        by_date='date',
                        metric_name=metric_val,
                        aggregation_metric='mean',
                        aggregate_by_field='ticker',
                        scalar=10000.0,
                        weighting_field='executed_notional_in_reporting_currency'),

                    # Display a bar chart showing the average metric weighted by notional and
                    # aggregated by ticker and venue
                    BarResultsForm(
                        trade_order_list=['trade_df'],
                        metric_name=metric_val,
                        aggregation_metric='mean',
                        aggregate_by_field=['ticker', 'venue'],
                        scalar=10000.0,
                        weighting_field='executed_notional_in_reporting_currency')
                ]

                try:
                    timeline_trade_df_metric_by_ticker = self.get_cached_computation_analysis(
                        key='timeline_trade_df_' + metric_val + '_by_ticker',
                        start_date=start_date_val, finish_date=finish_date_val,
                        event_type=event_type_val,
                        ticker=ticker_val_list, broker=broker_val_list,
                        algo=algo_val_list, venue=venue_val_list,
                        market_data=market_data_val,
                        dummy_market=True,
                        tca_engine=self._tca_engine,
                        tca_type='aggregated',
                        metric_calcs=metric_val,
                        metric_trade_order_list=['trade_df'],
                        results_form=results_form,
                        force_calculate=True, reload_val=reload_val,
                        trade_order_mapping=['trade_df'])

                    calc_start = timeline_trade_df_metric_by_ticker.index[0]
                    calc_end = timeline_trade_df_metric_by_ticker.index[-1]

                    aggregated_title = self.create_status_msg_flags('aggregated', ticker_val, calc_start, calc_end)

                    logger.debug('Plotted aggregated summary plot!')

                    finish = time.time()

                except Exception as e:
                    LoggerManager().getLogger(__name__).exception(e)

                    return "Status: error - " + str(e) + ". Check data exists for these dates?"

            except Exception as e:
                LoggerManager().getLogger(__name__).exception(e)

                return 'Status: error - ' + str(e) + ". Check data exists for these dates?"

            return 'Status: calculated ' + str(round(finish - start, 3)) + "s for " + aggregated_title

    elif tca_type == 'compliance':
        ticker_val, start_date_val, finish_date_val, broker_val, algo_val, venue_val, reload_val, market_data_val, \
            filter_time_of_day_val, start_time_of_day_val, finish_time_of_day_val, slippage_bounds_val, \
            visualization_val, n_clicks = args

        # Catch cases where users repeatedly click, which can cause misalignment in clicks
        self._session_manager.set_session_clicks(tag, n_clicks, old_clicks=old_clicks)

        logger.debug(self.create_generate_button_msg(old_clicks, n_clicks))

        if ticker_val != '' and start_date_val != '' and broker_val != '' and algo_val != '' and venue_val != '' \
                and finish_date_val != '' and reload_val != '' and filter_time_of_day_val != '' \
                and start_time_of_day_val != '' and finish_time_of_day_val != '' and slippage_bounds_val != '' \
                and n_clicks > old_clicks:

            # Expand tickers/broker fields etc, in case for example 'All' has been specified or any other groups
            ticker_val_list = self._util_func.populate_field(ticker_val, constants.available_tickers_dictionary)
            broker_val_list = self._util_func.populate_field(
                broker_val, constants.available_brokers_dictionary, exception_fields='All')
            algo_val_list = self._util_func.populate_field(
                algo_val, constants.available_algos_dictionary, exception_fields='All')
            venue_val_list = self._util_func.populate_field(
                venue_val, constants.available_venues_dictionary, exception_fields='All')

            logger.debug('Calculation click old: ' + str(old_clicks) + " clicks vs new " + str(n_clicks))

            self._session_manager.set_session_clicks(tag, n_clicks)

            if visualization_val == 'yes':
                self._session_manager.set_session_flag('compliance-visualization', True)
            else:
                self._session_manager.set_session_flag('compliance-visualization', False)

            try:
                # Clear the cache for the current user
                self._glob_volatile_cache.clear_key_match(self._session_manager.get_session_id())

                slippage_bounds = 0.0
                overwrite_bid_ask = True

                if slippage_bounds_val == 'bid/ask':
                    overwrite_bid_ask = False
                else:
                    slippage_bounds = float(slippage_bounds_val)

                metric_calcs = [
                    # Calculate slippage for trades
                    MetricSlippage(trade_order_list='trade_df'),
                ]

                benchmark_calcs = [
                    # Generate the spread to mid for market data (in certain cases, artificially
                    # create a spread)
                    BenchmarkSpreadToMid(
                        bid_mid_bp=slippage_bounds, ask_mid_bp=slippage_bounds,
                        overwrite_bid_ask=overwrite_bid_ask)
                ]

                results_form = [
                    # Display a table of all the anomalous trades by slippage (ie. outside bid/ask)
                    TableResultsForm(
                        # Only display for trades
                        trade_order_list=['trade_df'],

                        # Display slippage
                        metric_name='slippage',

                        # Order by the worst slippage
                        filter_by='worst_all',

                        # Replace text on table to make it look nicer
                        replace_text={'markout_': '', 'executed_notional': 'exec not',
                                      '_currency': ' cur', '_in_reporting': ' in rep',
                                      'slippage_benchmark': 'benchmark',
                                      'slippage_anomalous': 'anomalous',
                                      'broker_id': 'broker ID', 'algo_id': 'algo ID',
                                      'executed_price': 'price'},

                        exclude_fields_from_avg=['slippage_anomalous', 'slippage_benchmark', 'side'],

                        # Only select trades outside bid/ask (ie. where slippage_anomalous = 1)
                        tag_value_combinations={'slippage_anomalous': 1.0},

                        # Display several columns
                        keep_fields=['ticker', 'broker_id', 'algo_id', 'notional_currency',
                                     'executed_notional', 'executed_notional_in_reporting_currency',
                                     'side', 'executed_price'],

                        # Multiply slippage field by 10000 (to convert into basis points)
                        scalar={'slippage': 10000.0},

                        # Round figures to make them easier to read
                        round_figures_by={'executed_notional': 0,
                                          'executed_notional_in_reporting_currency': 0,
                                          'side': 0, 'slippage': 2, 'slippage_benchmark': 4}),

                    # Get the total notional executed by broker (in reporting currency)
                    BarResultsForm(
                        # Only select trades
                        trade_order_list=['trade_df'],

                        # Aggregate by broker name
                        aggregate_by_field='broker_id',

                        # Select the notional for analysis
                        metric_name='executed_notional_in_reporting_currency',

                        # Sum all the notionals
                        aggregation_metric='sum',

                        # Round figures
                        round_figures_by=0)
                ]

                # Reformat tables for notional by broker (left commented out as an example of usage)
                join_tables = [
                    # JoinTables(
                    #     tables_dict={'table_name': 'jointables_broker_id_df',
                    #
                    #                  # Fetch the following calculated tables
                    #                  'table_list': [
                    #                      'bar_order_df_executed_notional_in_reporting_currency_by_broker_id'],
                    #
                    #                  # Append to the columns of each table
                    #                  'column_list': ['notional (rep cur)'],
                    #                  'replace_text': {'broker_id': 'broker ID'}
                    #                  })
                ]

                try:
                    trade_df = self.get_cached_computation_analysis(
                        key='trade_df',
                        start_date=start_date_val, finish_date=finish_date_val,
                        start_time_of_day=start_time_of_day_val,
                        finish_time_of_day=finish_time_of_day_val,
                        filter_time_of_day=filter_time_of_day_val,
                        event_type='trade',
                        ticker=ticker_val_list, broker=broker_val_list,
                        algo=algo_val_list, venue=venue_val_list,
                        dummy_market=True,
                        market_data=market_data_val,
                        tca_engine=self._tca_engine,
                        tca_type='compliance',
                        metric_calcs=metric_calcs,
                        benchmark_calcs=benchmark_calcs,
                        metric_trade_order_list=['trade_df'],
                        results_form=results_form,
                        join_tables=join_tables,
                        force_calculate=True, reload_val=reload_val,
                        trade_order_mapping=['trade_df'])

                    calc_start = trade_df.index[0]
                    calc_end = trade_df.index[-1]

                    compliance_title = self.create_status_msg_flags('compliance', ticker_val, calc_start, calc_end)

                    logger.debug('Generated compliance summary.. awaiting plot callbacks!')

                    finish = time.time()

                except Exception as e:
                    logger.exception(e)

                    return "Status: error " + str(e) + ". Check data exists for these dates?"

            except Exception as e:
                logger.exception(e)

                return 'Status: error ' + str(e) + ". Check data exists for these dates?"

            return 'Status: calculated ' + str(round(finish - start, 3)) + "s for " + compliance_title

    # Not very elegant, but the only way to prevent plots disappearing when nothing has changed
    raise dash.exceptions.PreventUpdate("No data changed - " + tca_type)
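# The compliance report above keys off the 'slippage_anomalous' flag, ie. trades whose executed
# price falls outside the prevailing bid/ask band. A simplified sketch of how such a flag could be
# derived is below, assuming the trade DataFrame already carries matched 'bid' and 'ask' columns at
# execution time; tcapy's MetricSlippage computes this differently and more robustly.

import numpy as np


def flag_anomalous_trades(trade_df):
    """Mark a trade as anomalous (1.0) when it executed outside the bid/ask band, else 0.0."""
    outside = (trade_df['executed_price'] > trade_df['ask']) | \
              (trade_df['executed_price'] < trade_df['bid'])

    trade_df['slippage_anomalous'] = np.where(outside, 1.0, 0.0)

    return trade_df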
def calculate_metrics_single_ticker(self, market_trade_order_combo, tca_request, dummy_market):
    """Calls auxiliary methods to get market/trade data for a single ticker. If necessary splits up the
    request into smaller date chunks to collect market and trade data in parallel (using Celery)

    Parameters
    ----------
    market_trade_order_combo : tuple
        Market data and trade/order data to be reassembled

    tca_request : TCARequest
        Parameters for the TCA analysis

    dummy_market : bool
        Should we put a dummy variable instead of returning market data

    Returns
    -------
    DataFrame, DataFrameHolder, str
    """
    trade_order_filter = tca_request.trade_order_filter
    benchmark_calcs = tca_request.benchmark_calcs
    metric_calcs = tca_request.metric_calcs
    ticker = tca_request.ticker

    logger = LoggerManager.getLogger(__name__)

    # Reassemble market and trade data from the tuple
    market_df, trade_order_df_dict = self.trim_sort_market_trade_order(
        market_trade_order_combo, tca_request.start_date, tca_request.finish_date,
        tca_request.ticker)

    # Calculate BenchmarkMarket's which only require market data and no trade data
    market_df = self.calculate_benchmark_market(market_df, tca_request)

    trade_order_df_values = []
    trade_order_df_keys = []

    # Calculations on trades with market data
    if len(trade_order_df_dict.keys()) > 0 and self._check_valid_market(market_df):

        # NOTE: this will not filter orders, only TRADES (as orders do not have venue parameters)
        logger.debug("Filter trades by venue")

        simple_filters = {'venue': tca_request.venue}

        if 'trade_df' in self._util_func.dict_key_list(trade_order_df_dict.keys()):
            for s in simple_filters.keys():
                trade_order_df_dict['trade_df'] = self._trade_order_tag.filter_trade_order(
                    trade_order_df=trade_order_df_dict['trade_df'],
                    tag_value_combinations={s: simple_filters[s]})

        # Do additional more customised post-filtering of the trade/orders (eg. by broker_id, algo_id)
        if trade_order_filter is not None:
            for a in trade_order_filter:
                trade_order_df_dict = a.filter_trade_order_dict(
                    trade_order_df_dict=trade_order_df_dict)

        # NOTE: this will not filter orders, only TRADES (as orders do not have event type parameters)
        simple_filters = {'event_type': tca_request.event_type}

        if 'trade_df' in self._util_func.dict_key_list(trade_order_df_dict.keys()):
            for s in simple_filters.keys():
                trade_order_df_dict['trade_df'] = self._trade_order_tag.filter_trade_order(
                    trade_order_df=trade_order_df_dict['trade_df'],
                    tag_value_combinations={s: simple_filters[s]})

        # Remove any trade/orders which are empty
        t_remove = []

        for t in trade_order_df_dict.keys():
            if trade_order_df_dict[t] is None:
                t_remove.append(t)

                logger.warning(t + " is empty.. might cause problems later!")
            elif trade_order_df_dict[t].empty:
                t_remove.append(t)

                logger.warning(t + " is empty.. might cause problems later!")

        for t in t_remove:
            trade_order_df_dict.pop(t)

        trade_order_list = self._util_func.dict_key_list(trade_order_df_dict.keys())

        # Check if we have any trades/orders left to analyse
        if len(trade_order_list) == 0:
            logger.error("No trade/orders for " + ticker)
        else:
            # OK, we have some trade/orders left to analyse
            if not (isinstance(trade_order_list, list)):
                trade_order_list = [trade_order_list]

            logger.debug("Calculating derived fields and benchmarks")
            logger.debug("Calculating execution fields")

            # Calculate derived executed fields for orders
            # can only do this if trade_df is also available
            if len(trade_order_df_dict.keys()) > 1 and \
                    'trade_df' in self._util_func.dict_key_list(trade_order_df_dict.keys()):

                # For the orders, calculate the derived fields for executed notional, trade etc.
                aggregated_notional_fields = 'executed_notional'

                # Calculate the derived fields of the orders from the trades,
                # also calculate any benchmarks for the orders
                for i in range(1, len(trade_order_list)):
                    # NOTIONAL_EXECUTED: add derived field for executed price and notional executed for the orders
                    trade_order_df_dict[trade_order_list[i]] = \
                        self._metric_executed_price.calculate_metric(
                            lower_trade_order_df=trade_order_df_dict[trade_order_list[i - 1]],
                            upper_trade_order_df=trade_order_df_dict[trade_order_list[i]],
                            aggregated_ids=constants.order_name + '_pointer_id',
                            aggregated_notional_fields=aggregated_notional_fields,
                            notional_reporting_currency_spot='notional_reporting_currency_mid')[0]

            # TODO not sure about this?
            if 'trade_df' in self._util_func.dict_key_list(trade_order_df_dict.keys()):
                if 'notional' not in trade_order_df_dict['trade_df'].columns:
                    trade_order_df_dict['trade_df']['notional'] = \
                        trade_order_df_dict['trade_df']['executed_notional']

            logger.debug("Calculating benchmarks")

            # Calculate user specified benchmarks for each trade/order (which has been selected)
            if benchmark_calcs is not None:

                for i in range(0, len(trade_order_df_dict)):
                    for b in benchmark_calcs:

                        # For benchmarks which need to be generated on a trade by trade basis
                        # (eg. VWAP, arrival etc)
                        if not (isinstance(b, BenchmarkMarket)):
                            logger.debug("Calculating " + type(b).__name__ + " for " + trade_order_list[i])

                            if trade_order_df_dict[trade_order_list[i]] is not None:
                                if not (trade_order_df_dict[trade_order_list[i]].empty):
                                    trade_order_df_dict[trade_order_list[i]], _ = \
                                        b.calculate_benchmark(
                                            trade_order_df=trade_order_df_dict[trade_order_list[i]],
                                            market_df=market_df,
                                            trade_order_name=trade_order_list[i])

            logger.debug("Calculating metrics")

            # Calculate user specified metrics for each trade/order (which has been selected)
            if metric_calcs is not None:

                for i in range(0, len(trade_order_df_dict)):
                    for m in metric_calcs:
                        logger.debug("Calculating " + type(m).__name__ + " for " + trade_order_list[i])

                        if trade_order_df_dict[trade_order_list[i]] is not None:
                            if not (trade_order_df_dict[trade_order_list[i]].empty):
                                trade_order_df_dict[trade_order_list[i]], _ = \
                                    m.calculate_metric(
                                        trade_order_df=trade_order_df_dict[trade_order_list[i]],
                                        market_df=market_df,
                                        trade_order_name=trade_order_list[i])

            logger.debug("Completed derived field calculations for " + ticker)

    trade_order_df_dict = self._calculate_additional_metrics(market_df, trade_order_df_dict, tca_request)

    if dummy_market:
        market_df = None

    trade_order_df_keys = self._util_func.dict_key_list(trade_order_df_dict.keys())
    trade_order_df_values = []

    for k in trade_order_df_keys:
        trade_order_df_values.append(trade_order_df_dict[k])

    return market_df, trade_order_df_values, ticker, trade_order_df_keys
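# The calculate_metric call above rolls child trades up into their parent orders, deriving each
# order's executed notional and a notional-weighted executed price. A stripped-down pandas sketch
# of that roll-up follows; it assumes trades carry an 'order_pointer_id' column linking them to
# their parent order and that orders have an 'id' column (both names are hypothetical here, chosen
# only for illustration).

import pandas as pd


def rollup_trades_to_orders(trade_df, order_df, pointer_id='order_pointer_id'):
    """Attach executed notional and notional-weighted executed price to each parent order."""
    # Sum price * notional and notional per parent order in one groupby
    sums = trade_df[[pointer_id, 'executed_notional']] \
        .assign(px_notional=trade_df['executed_price'] * trade_df['executed_notional']) \
        .groupby(pointer_id).sum()

    # Weighted average executed price = sum(price * notional) / sum(notional)
    executed_price = sums['px_notional'] / sums['executed_notional']

    order_df = order_df.copy()
    order_df['executed_notional'] = order_df['id'].map(sums['executed_notional'])
    order_df['executed_price'] = order_df['id'].map(executed_price)

    return order_df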