class DatabasePopulator(ABC):
    """Connects to one data source (typically an external one via a DatabaseSource,
    eg. DatabaseNCFX), downloads historical market data from it and then dumps it
    locally - either as temporary HDF5/Parquet/CSV files or into an internal
    market database.
    """

    def __init__(self, temp_data_folder=constants.temp_data_folder,
                 temp_large_data_folder=constants.temp_large_data_folder,
                 tickers=None, data_store=None):
        self.temp_data_folder = temp_data_folder
        self.temp_large_data_folder = temp_large_data_folder
        self.tickers = None
        self.util_func = UtilFunc()
        self.time_series_ops = TimeSeriesOps()
        self.data_store = data_store

        logger = LoggerManager().getLogger(__name__)

        if not os.path.isdir(self.temp_data_folder):
            logger.warn("Temp data folder " + self.temp_data_folder + " does not exist")

        if not os.path.isdir(self.temp_large_data_folder):
            # Fix: previously printed temp_data_folder in this message
            logger.warn("Temp large data folder " + self.temp_large_data_folder + " does not exist")

        if tickers is not None:
            self.tickers = tickers

    @abc.abstractmethod
    def _fetch_market_data(self, start, finish, ticker, write_to_disk=True,
                           read_cached_from_disk=True, web_proxies=constants.web_proxies):
        """Fetches market data in a single download for a ticker. We need to be careful
        not to specify chunks which are too large, as many external sources will have
        a limit on how much data we can download in one chunk.

        Parameters
        ----------
        start : datetime
            Start date/time of the download

        finish : datetime
            Finish date/time of the download

        ticker : str
            Ticker to be downloaded

        write_to_disk : bool
            Cache the downloaded chunk to disk?

        read_cached_from_disk : bool
            Try reading a previously cached chunk before downloading?

        web_proxies : dict
            Addresses for web proxies

        Returns
        -------
        DataFrame, str
            Downloaded chunk and any status/error message (either may be None)
        """
        pass

    def _get_postfix(self):
        """The postfix which represents this data source, eg. 'ncfx' for New Change FX
        or 'dukascopy' for Dukascopy

        Returns
        -------
        str
        """
        pass

    @abc.abstractmethod
    def _get_output_data_source(self):
        """Gets the DatabaseSource object which represents how we wish to store
        the market data internally

        Returns
        -------
        DatabaseSource
        """
        return

    def _remove_weekend_points(self):
        # For FX and most other markets weekend points should be dropped
        # (cryptocurrency subclasses can override this to keep weekend data)
        return True

    @abc.abstractmethod
    def _get_input_data_source(self):
        """Gets the DatabaseSource object which represents how we input the market data
        (typically, this will be from an external data source)

        Returns
        -------
        DatabaseSource
        """
        return

    @abc.abstractmethod
    def _get_tickers(self):
        """List of tickers that can be accessed from the external/input DatabaseSource

        Returns
        -------
        str (list)
        """
        return

    @abc.abstractmethod
    def _get_threads(self, start_data_hist=None, finish_date_hist=None):
        """How many threads to use when downloading from our external/input
        DatabaseSource.

        Note: defaults added because internal call sites invoke this with no
        arguments.

        Returns
        -------
        int
        """
        return

    def download_to_csv(self, start_date, finish_date, tickers, remove_duplicates=True, split_size='monthly',
                        chunk_int_min=None, include_partial_periods=False,
                        write_temp_to_disk=True,
                        write_large_csv=True, write_large_hdf5_parquet=True,
                        csv_folder=constants.csv_folder, csv_compression=None, return_df=False,
                        web_proxies=constants.web_proxies):
        """Downloads market data in date splits (eg. monthly) and dumps each split
        to large CSV/HDF5/Parquet files on disk, optionally returning the data.

        Returns
        -------
        str (list), DataFrame (dict)
            Status messages and (if return_df) a dict of DataFrames keyed by ticker
        """
        start_date = self.time_series_ops.date_parse(start_date)
        finish_date = self.time_series_ops.date_parse(finish_date)

        dates = self.util_func.split_date_single_list(
            start_date, finish_date, split_size=split_size,
            add_partial_period_start_finish_dates=include_partial_periods)

        df_dict = {}
        msg = []

        for i in range(0, len(dates) - 1):
            msg_list, df_dict_list = self.download_from_external_source(
                start_date=dates[i], finish_date=dates[i + 1], tickers=tickers,
                chunk_int_min=chunk_int_min,
                append_data=False, remove_duplicates=remove_duplicates,
                write_temp_to_disk=write_temp_to_disk,
                write_to_disk_db=False, write_large_csv=write_large_csv,
                write_large_hdf5_parquet=write_large_hdf5_parquet,
                csv_folder=csv_folder, csv_compression=csv_compression,
                return_df=return_df, web_proxies=web_proxies)

            if msg_list != []:
                msg.append(msg_list)

            if return_df:
                for k in df_dict_list.keys():
                    if k in df_dict.keys():
                        # Fix: DataFrame.append was removed in pandas 2.x - use pd.concat
                        df_dict[k] = pd.concat([df_dict[k], df_dict_list[k]])
                    else:
                        df_dict[k] = df_dict_list[k]

        return self.util_func.flatten_list_of_lists(msg), df_dict

    def download_from_external_source(self, append_data=True, remove_duplicates=True, if_exists_table='append',
                                      if_exists_ticker='append', number_of_days=30 * 7, chunk_int_min=None,
                                      start_date=None, finish_date=None, delete_cached_files=False, tickers=None,
                                      write_temp_to_disk=True,
                                      write_to_disk_db=True, read_cached_from_disk=True, write_large_csv=False,
                                      write_large_hdf5_parquet=True,
                                      csv_folder=constants.csv_folder, csv_compression=None, return_df=False,
                                      web_proxies=constants.web_proxies):
        """Downloads market data from an external source and then dumps to HDF5/Parquet
        files for temporary storage which is cached. If HDF5/Parquet cached files
        already exist for a time segment we read them in, saving us an external data
        call. Lastly, dumps it to an internal database.

        Parameters
        ----------
        append_data : bool
            True - only start collecting later data not already in database
            (ignoring number_of_days parameter)
            False - start collecting all data, ignoring anything stored in database

        remove_duplicates : bool
            True (default) - remove values which are repeated
            False - leave in repeated values

        if_exists_table : str
            'append' - if database table already exists append data to it
            'replace' - remove existing database table

        if_exists_ticker : str
            'append' - if ticker already exists in the database, append to it
            'replace' - replace any data for this ticker

        number_of_days : int
            Number of days to download data for

        chunk_int_min : int (None)
            Size of each download (default - specified in constants)

        Returns
        -------
        str (list), DataFrame (dict)
            Status messages for failed/empty downloads, and (if return_df) a dict
            of downloaded DataFrames keyed by ticker
        """
        logger = LoggerManager.getLogger(__name__)

        if write_to_disk_db:
            data_source_local = self._get_output_data_source()

        if write_large_csv:
            if not os.path.isdir(csv_folder):
                # Fix: previously printed temp_data_folder in this message
                logger.warn("CSV folder " + csv_folder + " where we are about to write does not exist")

        # What chunk size in minutes do we want for this data provider?
        # NOTE(review): _get_download_chunk_min_size is not defined in this class -
        # presumably provided by a subclass/mixin; confirm
        if chunk_int_min is None:
            chunk_int_min = self._get_download_chunk_min_size()

        if chunk_int_min is None:
            chunk_size_str = None
        else:
            chunk_size_str = str(chunk_int_min) + "min"

        if tickers is None:
            tickers = self._get_tickers()

        if isinstance(tickers, str):
            tickers = [tickers]

        # If there's no start or finish date, choose a default start/finish date
        if start_date is None and finish_date is None:
            finish_date = datetime.datetime.utcnow()
            finish_date = datetime.datetime(finish_date.year, finish_date.month, finish_date.day, 0, 0, 0, 0)

            start_date = finish_date - timedelta(days=number_of_days)  # 30*7
        else:
            start_date = self.time_series_ops.date_parse(start_date)
            finish_date = self.time_series_ops.date_parse(finish_date)

            if finish_date < start_date:
                logger.error("Download finish date is before start date!")

                # Fix: callers unpack a 2-tuple, so don't return bare None here
                return [], {}

        now = pd.Timestamp(datetime.datetime.utcnow(), tz='utc')

        # Do not allow downloading of future data!
        if finish_date > now:
            finish_date = now

        df_dict = {}

        # Fix: accumulate messages across all chunks and tickers (previously msg_list
        # was reassigned per chunk, dropping earlier messages, and could be unbound)
        msg_list = []

        # Loop through each ticker
        for ticker in tickers:
            has_old = False

            if delete_cached_files and write_to_disk_db:
                logger.info("Deleting all cached temp files for " + ticker)

                for name in glob.glob(self.temp_data_folder + '/*' + ticker + "*"):
                    try:
                        os.remove(name)
                    except:
                        logger.warn("Couldn't delete file " + name)

                logger.info("Finished deleting cached files for " + ticker)

            # If we have been asked to append data, load up what you can from the
            # internal database and find the last point
            if append_data and if_exists_ticker == 'append' and write_to_disk_db:
                logger.info("Trying to download old data first for " + ticker)

                try:
                    df_old = data_source_local.fetch_market_data(
                        start_date, finish_date, ticker, web_proxies=web_proxies)

                    # This will vary between tickers (in particular if we happen to
                    # add a new ticker)
                    start_date = df_old.index[-1]

                    has_old = True

                    # Remove reference - big file!
                    df_old = None
                except Exception as e:
                    logger.info("No data found for ticker " + ticker + " with error: " + str(e))
            else:
                logger.info("Downloading new data for " + ticker + ".")

            # Date range may not work with timezones
            start_date = pd.Timestamp(start_date.replace(tzinfo=None))
            finish_date = pd.Timestamp(finish_date.replace(tzinfo=None))

            if finish_date - start_date < pd.Timedelta(days=1):
                start_date_list = [start_date, finish_date]
            else:
                # Download from that last point to the present day, day by day
                # (otherwise can end up with too many open files!)
                start_date_list = pd.date_range(start_date, finish_date)

                start_date_list = [pd.Timestamp(x.to_pydatetime()) for x in start_date_list]

                if finish_date > start_date_list[-1]:
                    start_date_list.append(finish_date)

            df = None
            filename = os.path.join(self.temp_data_folder, ticker) + '.' + fileformat

            # Hook for reading a previously cached aggregate file back in, eg.
            # df = UtilFunc().read_dataframe_from_hdf(filename)

            # Create downloads in x minute chunks (if we request very large chunks of
            # data with certain data providers, we could cause problems!)
            if df is None:
                df_remote_list = []

                # Loop by day
                for i in range(0, len(start_date_list) - 1):
                    if chunk_size_str is not None:
                        if start_date_list[i + 1] - start_date_list[i] < pd.Timedelta(minutes=chunk_int_min):
                            start_date_hist = [start_date_list[i]]
                            finish_date_hist = [start_date_list[i + 1]]
                        else:
                            start_date_hist, finish_date_hist = UtilFunc().split_into_freq(
                                start_date_list[i], start_date_list[i + 1], freq=chunk_size_str)
                    else:
                        start_date_hist = [start_date_list[i]]
                        finish_date_hist = [start_date_list[i + 1]]

                    # For FX and most other markets we should remove weekends
                    # (cryptocurrencies do have weekend data)
                    if self._remove_weekend_points():
                        start_date_hist, finish_date_hist = UtilFunc().remove_weekend_points(
                            start_date_hist, finish_date_hist)

                    output = []

                    if constants.use_multithreading:
                        # Create a multiprocess object for downloading data
                        swim = Swim(parallel_library=constants.database_populator_threading_library)
                        pool = swim.create_pool(thread_no=self._get_threads())

                        result = []

                        # Fix: use a separate index so we don't shadow the outer loop's 'i'
                        for j in range(0, len(start_date_hist)):
                            result.append(
                                pool.apply_async(
                                    self._fetch_market_data,
                                    args=(start_date_hist[j], finish_date_hist[j], ticker,
                                          write_temp_to_disk, read_cached_from_disk, web_proxies)))

                        output = [p.get() for p in result]

                        swim.close_pool(pool, True)
                    else:
                        # Otherwise run in single threaded fashion
                        for j in range(0, len(start_date_hist)):
                            output.append(
                                self._fetch_market_data(
                                    start_date_hist[j], finish_date_hist[j], ticker,
                                    write_to_disk=write_temp_to_disk,
                                    read_cached_from_disk=read_cached_from_disk,
                                    web_proxies=web_proxies))

                    # Get all the dataframe chunks and returned messages
                    df_list = [
                        self._remove_duplicates_time_series(x, remove_duplicates, field='mid')
                        for x, y in output if x is not None
                    ]

                    msg_list.extend([y for x, y in output if x is not None and y is not None])

                    # Concatenate all the 5 (or larger) minute data chunks
                    try:
                        if df_list != []:
                            df_temp = pd.concat(df_list)

                            if df_temp is not None:
                                if not df_temp.empty:
                                    df_remote_list.append(df_temp)
                    except Exception as e:
                        logger.error(str(e))

                if df_remote_list != []:
                    df = pd.concat(df_remote_list)

                    # Need to sort data (database assumes sorted data for
                    # chunking/searches)
                    df = df.sort_index()
                    df = self.time_series_ops.localize_as_UTC(df)

            if write_large_hdf5_parquet:
                if df is not None:
                    if not df.empty:
                        key = '_' + self._get_postfix() + "_" + \
                              (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')
                        filename = os.path.join(csv_folder, ticker + key) + '.' + fileformat

                        # Temporary cache for testing purposes (also if the process
                        # crashes, we can read this back in)
                        UtilFunc().write_dataframe_to_binary(df, filename, format=binary_format)

            if df is not None:
                # Assume UTC time (don't want to mix UTC and non-UTC in database!)
                df = self.time_series_ops.localize_as_UTC(df)

            # Write CSV
            if write_large_csv:
                if df is not None:
                    if not df.empty:
                        key = '_' + self._get_postfix() + "_" + \
                              (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')

                        # Fix: string comparison with '==' (was identity check 'is')
                        if csv_compression == 'gzip':
                            df.to_csv(os.path.join(csv_folder, ticker + key + ".csv.gz"),
                                      compression='gzip')
                        else:
                            df.to_csv(os.path.join(csv_folder, ticker + key + ".csv"))

            if return_df:
                df_dict[ticker] = df

            # Dump what we have locally (or whatever DatabaseSource we have defined)
            try:
                start_date = start_date.replace(tzinfo=pytz.utc)

                # Remove first point if it matches the last point already in the
                # database (fix: was df[-1:], which kept ONLY the last row, and
                # lacked a None guard)
                if has_old and df is not None:
                    if df.index[0] == start_date:
                        df = df[1:]

                if df is not None:
                    df = df.sort_index()
                    df = self._remove_duplicates_time_series(df, remove_duplicates, field='mid')

                if write_to_disk_db and df is not None:
                    data_source_local.append_market_data(
                        df, ticker, if_exists_table=if_exists_table,
                        if_exists_ticker=if_exists_ticker)

                    logger.info("Wrote to database for " + ticker)
            except Exception as e:
                final_err = "Data was missing for these dates " + str(start_date) + " - " + str(finish_date) \
                            + " for " + str(tickers) \
                            + " Didn't write anything to disk or return any valid dataframe: " + str(e)

                logger.error(final_err)

            if df is None:
                msg_list.append("No downloaded data for " + str(start_date) + " - "
                                + str(finish_date) + ". Is this a holiday?")

        # Returns a status containing any failed downloads, which can be read by a user
        return msg_list, df_dict

    def _remove_duplicates_time_series(self, df, remove_duplicates, time_series_ops=None, field='mid'):
        """Optionally drops consecutive duplicated values of `field` from df.

        Fix: accepts an optional TimeSeriesOps (several internal call sites pass one
        positionally, which previously collided with `field` and raised TypeError).
        """
        if time_series_ops is None:
            time_series_ops = self.time_series_ops

        if remove_duplicates:
            df = time_series_ops.drop_consecutive_duplicates(df, field)

        return df

    def combine_mini_df_from_disk(self, tickers=None, remove_duplicates=True):
        """Combines the mini HDF5/Parquet files for eg. 5 min chunks into a very large
        HDF5/Parquet file, which is likely to be for multiple months of data. Uses
        multithreading to speed up, by using a thread for each different ticker.

        Parameters
        ----------
        tickers : str (list or dict)
            Ticker of each ticker

        remove_duplicates : bool
            Remove duplicated market prices, which follow one another

        Returns
        -------
        """
        if tickers is None:
            tickers = self.tickers.keys()

        if isinstance(tickers, dict):
            tickers = tickers.keys()

        if not isinstance(tickers, list):
            tickers = [tickers]

        if constants.use_multithreading:
            swim = Swim(parallel_library=constants.database_populator_threading_library)
            pool = swim.create_pool(thread_no=self._get_threads())

            result = []

            for i in range(0, len(tickers)):
                result.append(
                    pool.apply_async(
                        self._combine_mini_df_from_disk_single_thread,
                        args=(tickers[i], remove_duplicates,)))

            # Wait for all workers (and surface any exceptions they raised)
            for p in result:
                p.get()

            swim.close_pool(pool, True)
        else:
            for i in range(0, len(tickers)):
                self._combine_mini_df_from_disk_single_thread(tickers[i], remove_duplicates)

    def _combine_mini_df_from_disk_single_thread(self, ticker, remove_duplicates=True):
        """Combines all the small cached chunk files for one ticker into a single
        large file in temp_large_data_folder."""
        logger = LoggerManager.getLogger(__name__)

        time_series_ops = TimeSeriesOps()

        logger.info('Getting ' + ticker + ' filenames...')

        temp_data_folder = self.temp_data_folder

        filename_list = []

        for root, dirnames, filenames in os.walk(temp_data_folder):
            for filename in filenames:
                if ticker in filename and '.' + fileformat in filename:
                    filename_h5_parquet = os.path.join(root, filename)

                    # If filename is less than 10MB add (otherwise likely a very
                    # large aggregated file!)
                    if os.path.getsize(filename_h5_parquet) < 10 * 1024 * 1024:
                        filename_list.append(filename_h5_parquet)

        df_list = []

        util_func = UtilFunc()

        logger.info('Loading ' + ticker + ' mini dataframe into memory')

        i = 0

        if len(filename_list) == 0:
            logger.warn("Looks like there are no files for " + ticker + " in "
                        + temp_data_folder + ". Are you sure path is correct?")

        # Go through each mini file which represents a few minutes of data and append it
        for filename in filename_list:
            filesize = 0

            try:
                filesize = os.path.getsize(filename) / 1024.0
                df = util_func.read_dataframe_from_binary(filename, format=binary_format)

                i = i + 1

                # Every 100 files print reading output
                if i % 100 == 0:
                    logger.info('Reading ' + filename + ' number ' + str(i))

                if df is not None:
                    df = df.sort_index()
                    df = self._remove_duplicates_time_series(df, remove_duplicates,
                                                             time_series_ops, field='mid')

                    df_list.append(df)
            except Exception as e:
                logger.warn('Failed to parse ' + filename + " of " + str(filesize) + "KB")

        # Assume UTC time (don't want to mix UTC and non-UTC in database!)
        if df_list == []:
            logger.warn('No dataframe read for ' + ticker + ', cannot combine!')

            return

        logger.info('About to combine ' + ticker + ' into large dataframe to write to disk...')

        df = pd.concat(df_list)
        df = time_series_ops.localize_as_UTC(df)

        df = df.sort_index()
        df = self._remove_duplicates_time_series(df, remove_duplicates,
                                                 time_series_ops, field='mid')

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder, ticker + postfix) + '.' + fileformat

        df = time_series_ops.localize_as_UTC(df)
        util_func.write_dataframe_to_binary(df, filename, format=binary_format)

    def write_df_to_db(self, tickers=None, remove_duplicates=True, if_exists_table='append',
                       if_exists_ticker='replace'):
        """Loads up a large HDF5/Parquet file from disk into a pd DataFrame and then
        dumps locally. Uses multithreading to speed it up, by using a thread for each
        different ticker.

        Parameters
        ----------
        tickers : str (list or dict)
            List of tickers

        remove_duplicates : bool
            True (default) - removes any follow on duplicates in the dataset

        if_exists_table : str
            'append' - if database table already exists append data to it
            'replace' - remove existing database table

        if_exists_ticker : str
            'append' - if ticker already exists in the database, append to it
            'replace' - replace any data for this ticker

        Returns
        -------
        """
        if tickers is None:
            tickers = self.tickers.keys()

        if isinstance(tickers, dict):
            tickers = tickers.keys()

        if not isinstance(tickers, list):
            tickers = [tickers]

        if constants.use_multithreading:
            swim = Swim(parallel_library=constants.database_populator_threading_library)
            pool = swim.create_pool(thread_no=self._get_threads())

            result = []

            for i in range(0, len(tickers)):
                result.append(
                    pool.apply_async(
                        self._write_df_to_db_single_thread,
                        args=(tickers[i], remove_duplicates, if_exists_table, if_exists_ticker,)))

            # Wait for all workers (and surface any exceptions they raised)
            for p in result:
                p.get()

            swim.close_pool(pool, True)
        else:
            for i in range(0, len(tickers)):
                self._write_df_to_db_single_thread(tickers[i], remove_duplicates,
                                                   if_exists_table, if_exists_ticker)

    def _write_df_to_db_single_thread(self, ticker, remove_duplicates=True, if_exists_table='append',
                                      if_exists_ticker='replace'):
        """Reads the combined large file for one ticker and appends it to the
        output database."""
        logger = LoggerManager.getLogger(__name__)

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder, ticker + postfix) + '.' + fileformat

        logger.info("Reading " + filename)

        util_func = UtilFunc()
        time_series_ops = TimeSeriesOps()
        data_source_local = self._get_output_data_source()

        df = util_func.read_dataframe_from_binary(filename, format=binary_format)

        if df is not None:
            df = time_series_ops.localize_as_UTC(df)

            data_source_local.append_market_data(
                df, ticker, if_exists_table=if_exists_table, if_exists_ticker=if_exists_ticker)
        else:
            logger.warn("Couldn't write dataframe for " + ticker
                        + " to database, appears it is empty!")

    def _remove_saturday(self):
        # Saturday has no market data for FX; subclasses may override
        return True
class SessionManager(object):
    """Caches per-user state on the server side.

    Identifies users and stores session-scoped variables on the server, eg.
    which lines have been plotted, the user's zoom actions, whether tcapy has
    already plotted a particular dataset, etc.
    """

    def __init__(self):
        self._util_func = UtilFunc()

    # Session ID management functions

    def get_session_id(self):
        """Returns the current user's session ID, generating a unique one first if
        this session does not yet have an ID.

        Returns
        -------
        str
        """
        if 'id' in session:
            session_id = session['id']
        else:
            session_id = str(uuid.uuid4())

            username = self.get_username()

            # Mask the username before storing it alongside the generated ID
            username = '' if username is None else '******' + username

            session['id'] = session_id + username

        if not isinstance(session_id, str):
            session_id = session_id.decode("utf-8")

        return session_id

    def get_username(self):
        """Extracts the username from the HTTP Basic Auth header (None if absent)."""
        header = flask.request.headers.get('Authorization', None)

        if not header:
            return None

        decoded = base64.b64decode(header.split('Basic ')[1]).decode('utf-8')

        username, password = decoded.split(':')

        return username

    def set_session_flag(self, tag, value=None):
        """Stores a value under one or more tags in the session dictionary, which is
        essentially unique for every user.

        Parameters
        ----------
        tag : str (list or dict)
            The "hash key(s)" for our variable; a dict maps each tag to its own value

        value : str
            What to set the value in our hash table

        Returns
        -------
        """
        if isinstance(tag, str):
            tag = [tag]

        if isinstance(tag, dict):
            for t in tag:
                self.set_session_flag(t, value=tag[t])

            return

        for t in self._util_func.flatten_list_of_lists(tag):
            session[t] = value

    def get_session_flag(self, tag):
        """Fetches the value of a tag in the user's session.

        Parameters
        ----------
        tag : str
            Tag to be fetched

        Returns
        -------
        str or bool or None
            Booleans are returned as-is, other stored values as str; None if absent
        """
        if tag not in session:
            return None

        stored = session[tag]

        return stored if isinstance(stored, bool) else str(stored)

    ##### These methods keep track of which lines / user zooms have been plotted for
    ##### each chart in the user's session object

    def check_lines_plotted(self, lines_to_plot, tag):
        """Checks whether exactly these lines have already been plotted for the
        chart stored under `tag` in the user's session.

        Parameters
        ----------
        lines_to_plot : str (list)
            Lines to be plotted

        tag : str
            Tag of plotted lines

        Returns
        -------
        bool
        """
        if tag not in session:
            return False

        return set(lines_to_plot) == set(session[tag])

    def check_relayoutData_plotted(self, relayoutData, tag):
        """Checks whether this relayout data (ie. a user's click/zoom action) has
        already been plotted for the chart stored under `tag`.

        Parameters
        ----------
        relayoutData : dict

        tag : str
            Tag referring to a particular plot

        Returns
        -------
        bool
        """
        return tag in session and relayoutData == session[tag]

    def set_lines_plotted(self, lines_to_plot, tag):
        """Records the lines plotted for a particular chart tag in the user's session.

        Parameters
        ----------
        lines_to_plot : str (list)
            Lines plotted

        tag : str
            Tag of the plot

        Returns
        -------
        """
        session[tag] = lines_to_plot

    def set_relayoutData_plotted(self, relayoutData, tag):
        """Records the user's clicks (typically chart zooms) for a particular chart.

        Parameters
        ----------
        relayoutData : dict
            Details a user's click on the chart

        tag : str
            Tag referring to the plot

        Returns
        -------
        """
        session[tag] = relayoutData

    def set_username(self, username):
        """Stores the username in the user's session."""
        session['username'] = username

    ##### We identify when a user has "clicked" a button by a change in the number of
    ##### clicks (Dash documentation recommends this to handle user clicks)

    def get_session_clicks(self, tag):
        """Returns the number of clicks recorded under `tag` (0 if not recorded).

        Parameters
        ----------
        tag : str
            The tag for which we want to return the number of clicks

        Returns
        -------
        Number of clicks by current user
        """
        return session[tag] if tag in session else 0

    def set_session_clicks(self, tag, n_clicks, old_clicks=None):
        """Sets the number of clicks in the current user's session.

        Parameters
        ----------
        tag : str
            Tag to store the user's clicks under

        n_clicks : int
            Number of clicks to set

        old_clicks : int (None)
            If given, only overwrite when it exceeds n_clicks

        Returns
        -------
        """
        if old_clicks is None or old_clicks > n_clicks:
            session[tag] = n_clicks

    def check_session_tag(self, tag):
        """Returns the value stored under `tag` in the user's session, or False if
        the tag does not exist.

        Parameters
        ----------
        tag : str
            Tag to check

        Returns
        -------
        str or bool
        """
        if tag in session:
            return session[tag]

        return False

    def exists_session_tag(self, tag):
        """Does a tag exist in the current user session?

        Parameters
        ----------
        tag : str

        Returns
        -------
        bool
        """
        return tag in session

    def check_session_reset_tag(self, tag):
        """If `tag` exists and is truthy, resets it to False and returns True;
        otherwise returns False.

        Parameters
        ----------
        tag : str
            Tag to check

        Returns
        -------
        bool
        """
        if tag in session and session[tag]:
            session[tag] = False

            return True

        return False

    def create_calculated_flags(self, prefix, lst=None, lst2=None):
        """Creates a list of tags for combinations of prefix and list elements.

        Parameters
        ----------
        prefix : str (or list)
            Prefix (typically a page name like 'detailed')

        lst : str (list)
            Tags will contain these

        lst2 : str (list)
            Tags will contain these

        Returns
        -------
        str (list)
        """
        if isinstance(prefix, list):
            prefix = self._util_func.flatten_list_of_lists(prefix)

            lst = [p + '-' + lst for p in prefix]
        elif isinstance(lst, list):
            lst = self._util_func.flatten_list_of_lists(lst)

            lst = [prefix + '-' + entry for entry in lst]

        if lst2 is None:
            return lst

        return [j + '-' + i for i in lst2 for j in lst]
class TCAMarketTradeLoader(ABC):
    """TCAMarketTradeLoader provides wrapper methods to load market and trade data and
    also adds additional calculated fields to the trade data such as metrics
    (slippage, market impact etc), benchmarks (mid, VWAP, TWAP etc.) etc. as well as
    ways to process this output for display for multiple tickers.

    Underneath it uses TCATickerLoader, for fetching data/calculating metrics for
    individual tickers. Typically, TCAMarketTradeLoader will be called by an instance
    of TCAEngine. However, it can also be called directly, if we simply want to
    download market or trade data by itself.
    """

    def __init__(self, version=constants.tcapy_version):
        self._util_func = UtilFunc()  # General utility operations (such as flatten lists)

        # To filter trade/orders according to the values of certain tags
        self._trade_order_tag = TradeOrderFilterTag()

        self._version = version

    def get_market_data(self, market_request):
        """Gets market data for tickers. When we ask for non-standard FX crosses, only
        the mid-field is returned (calculated as a cross rate). We do not give bid/ask
        quotes for calculated non-standard tickers, as these can be difficult to
        estimate.

        Parameters
        ----------
        market_request : MarketRequest
            The type of market data to get

        Returns
        -------
        DataFrame, or DataFrame (dict) keyed by ticker when several tickers requested
        """
        tca_ticker_loader = Mediator.get_tca_ticker_loader(version=self._version)

        if isinstance(market_request.ticker, list):
            if len(market_request.ticker) > 1:
                market_request_list = self._split_tca_request_into_list(market_request)

                market_df_dict = {}

                for market_request_single in market_request_list:
                    # Fix: key by the single ticker (previously keyed by the whole
                    # ticker list) and use the loader we already fetched (previously
                    # the loader instance was wrongly re-invoked as a callable)
                    market_df_dict[market_request_single.ticker] = \
                        tca_ticker_loader.get_market_data(market_request_single)

                # Fix: previously fell through and refetched the whole request,
                # discarding the per-ticker dict we just built
                return market_df_dict

        return tca_ticker_loader.get_market_data(market_request)

    def get_trade_order_data(self, tca_request, trade_order_type):
        """Gets trade data for specified parameters (eg. start/finish dates tickers).
        Will also try to find trades when they have been booked in the inverted market
        convention, and change the fields appropriately. For example, if we ask for
        GBPUSD trade data, it will also search for USDGBP and convert those trades in
        the correct convention.

        Parameters
        ----------
        tca_request : TCARequest
            What type of trade data do we want

        trade_order_type : str
            Do we want trade or order data?

        Returns
        -------
        DataFrame
        """
        tca_ticker_loader = Mediator.get_tca_ticker_loader(version=self._version)

        if isinstance(tca_request.ticker, list):
            if len(tca_request.ticker) > 1:
                tca_request_list = self._split_tca_request_into_list(tca_request)

                # NOTE(review): this per-ticker list is built but then discarded -
                # the combined request below is fetched regardless. Confirm whether
                # the per-ticker results were meant to be combined and returned.
                trade_df_list = []

                for tca_request_single in tca_request_list:
                    # Fix: use the loader instance directly (previously it was
                    # wrongly re-invoked as a callable, raising TypeError)
                    trade_df_list.append(
                        tca_ticker_loader.get_trade_order_data(
                            tca_request_single, trade_order_type))

        df_dict = tca_ticker_loader.get_trade_order_data(tca_request, trade_order_type)

        return df_dict

    def get_trade_order_holder(self, tca_request):
        """Gets the trades/orders in the form of a TradeOrderHolder

        Parameters
        ----------
        tca_request : TCARequest
            Parameters for the TCA computation

        Returns
        -------
        TradeOrderHolder
        """
        tca_ticker_loader = Mediator.get_tca_ticker_loader(version=self._version)

        if isinstance(tca_request.ticker, list):
            if len(tca_request.ticker) > 1:
                tca_request_list = self._split_tca_request_into_list(tca_request)

                trade_order_holder = DataFrameHolder()

                for tca_request_single in tca_request_list:
                    # Fix: use the loader instance directly (previously it was
                    # wrongly re-invoked as a callable)
                    trade_order_holder.add_dataframe_holder(
                        tca_ticker_loader.get_trade_order_holder(tca_request_single))

                # Fix: previously the combined holder was discarded and the method
                # fell through to refetch the whole request
                return trade_order_holder

        return tca_ticker_loader.get_trade_order_holder(tca_request)

    def get_market_trade_order_holder(self, tca_request):
        """Gets both the market data and trade/order data associated with a TCA
        calculation as a tuple of (DataFrame, DataFrameHolder)

        Parameters
        ----------
        tca_request : TCARequest
            Parameters for a TCA calculation

        Returns
        -------
        DataFrame, DataFrameHolder
        """
        return Mediator.get_tca_ticker_loader(
            version=self._version).get_market_trade_order_holder(tca_request)

    def load_market_calculate_summarize_metrics(self, tca_request, dummy_market=False):
        """Splits up the TCA request into individual tickers. Market/trade data is
        loaded for each ticker, before conducting TCA (ie. calculating metrics,
        benchmarks etc.). Returns a dictionary consisting of market data and another
        dictionary of trade/order data (and any additional results associated with
        the TCA)

        Parameters
        ----------
        tca_request : TCARequest
            Parameters defining the TCA calculation

        dummy_market : bool, default False
            Do we return market data for future use?

        Returns
        -------
        DataFrame (dict), DataFrame (dict)

        Raises
        ------
        DataMissingException
            When trade data was requested but none exists for any selected ticker
        """
        # Load market/trade data and compute metrics/benchmarks etc. per ticker
        market_df_dict, trade_order_results_df_dict, tca_request_list = \
            self.get_market_trade_metrics(tca_request, dummy_market=dummy_market)

        # If every ticker we have selected doesn't have trades (and our analysis also
        # requires trades), we can't do any TCA at all
        if len(trade_order_results_df_dict) == 0 and tca_request.trade_data_store is not None \
                and tca_request.trade_order_mapping is None:
            logger = LoggerManager.getLogger(__name__)

            err_msg = "no trade data for specified ticker(s) and time range"

            logger.error(err_msg)

            raise DataMissingException(err_msg)

        # Now summarize those metrics across all the tickers, for easier display
        return self.summarize_metrics(market_df_dict, trade_order_results_df_dict,
                                      tca_request_list, dummy_market=dummy_market)

    def summarize_metrics(self, market_df_dict, trade_order_results_df_dict, tca_request_list,
                          dummy_market=False):
        """Takes in precomputed metrics across one or more tickers, and summarizes
        them for later user display (should be customised for users)

        Parameters
        ----------
        tca_request_list : TCARequest (list)
            List of TCARequests (typically, each is for a different ticker)

        dummy_market : bool
            Should we output market data for later use?

        Returns
        -------
        DataFrame (dict), DataFrame (dict)
        """
        # Allow user defined summary of metrics
        trade_order_results_df_dict = self._apply_summary_metrics(
            tca_request_list, trade_order_results_df_dict, market_df_dict)

        # Warning: do not ask for market data when requesting more than one ticker,
        # could cause memory leaks!
        if dummy_market:
            return None, trade_order_results_df_dict

        # Dictionary of market data and a dictionary of trades/orders/results of
        # TCA analysis
        return market_df_dict, trade_order_results_df_dict

    def _apply_summary_metrics(self, tca_request_list, trade_order_results_df_dict, market_df_dict):
        # Hook for subclasses to post-process/summarize the computed metrics
        return trade_order_results_df_dict

    def get_market_trade_metrics(self, tca_request, dummy_market=False):
        """Collects together all the market and trade data (and computes metrics) for
        each ticker specified in the TCARequest

        Parameters
        ----------
        tca_request : TCARequest
            Parameters for the TCA

        dummy_market : bool (default: False)
            Should dummy market data be returned (requires less memory)?

        Returns
        -------
        DataFrame (dict), DataFrame (dict), TCARequest (list)
        """
        logger = LoggerManager.getLogger(__name__)

        logger.debug("Start loading trade/data/computation")

        # Split up TCARequest into a list of TCA with different tickers
        tca_request_list = self._split_tca_request_into_list(tca_request)

        market_df_dict, trade_order_results_df_dict = self._get_market_trade_metrics(
            tca_request_list, dummy_market)

        logger.debug("Finished loading data and calculating metrics on individual tickers")

        return market_df_dict, trade_order_results_df_dict, tca_request_list

    def _get_market_trade_metrics(self, tca_request_list, dummy_market):
        """Gets the market and trade data, as well as computed metrics on them

        Parameters
        ----------
        tca_request_list : TCARequest (list)
            Requests for multiple TCARequests (eg. for different tickers)

        dummy_market : bool
            Return dummy market data?

        Returns
        -------
        DataFrame (dict), DataFrame (dict)
        """
        tca_ticker_loader = Mediator.get_tca_ticker_loader(version=self._version)

        market_df_dict = {}
        trade_order_holder_list = DataFrameHolder()

        for tca_request_single in tca_request_list:
            market_df, trade_order_df_dict = \
                tca_ticker_loader.get_market_trade_order_holder(tca_request_single)

            market_df, trade_order_df_list, ticker, trade_order_keys = \
                tca_ticker_loader.calculate_metrics_single_ticker(
                    (market_df, trade_order_df_dict), tca_request_single, dummy_market)

            market_df_dict[ticker] = market_df

            trade_order_holder_list.add_dataframe_dict(
                dict(zip(trade_order_keys, trade_order_df_list)))

        # Unpack the DataFrameHolder into a dictionary (combining the lists of trades,
        # orders etc. into single dataframes) - this may also decompress the trades
        trade_order_results_df_dict = trade_order_holder_list.get_combined_dataframe_dict()

        return market_df_dict, trade_order_results_df_dict

    def _split_tca_request_into_list(self, tca_request):
        """Splits a TCA request by ticker.

        Parameters
        ----------
        tca_request : TCARequest
            TCA request to be broken up into tickers

        Returns
        -------
        TCARequest (list)
        """
        ticker = tca_request.ticker

        if not isinstance(ticker, list):
            ticker = [ticker]

        tca_request_list = []

        # Go through every ticker (and also split into list)
        for tick in ticker:
            tca_request_temp = TCARequest(tca_request=tca_request)
            tca_request_temp.ticker = tick

            tca_request_list.append(tca_request_temp)

        return self._util_func.flatten_list_of_lists(tca_request_list)

    @abc.abstractmethod
    def get_tca_version(self):
        """Returns the version of TCA implemented by this loader."""
        pass
class TableResultsForm(ResultsForm):
    """Takes in trades/orders and then creates aggregated metrics which are likely to be displayed
    as a table. Can also sort by best/worst metrics, apply a weighted average, scaling, rounding of
    numbers and text replacement in the output columns.
    """

    def __init__(self, trade_order_list=None, metric_name=None, filter_by=['all'],
                 tag_value_combinations={},
                 keep_fields=['executed_notional', 'side'], replace_text={},
                 round_figures_by=1, scalar=1.0,
                 weighting_field=constants.table_weighting_field,
                 exclude_fields_from_avg=[]):
        """Initialises the form with default aggregation parameters, which can later be
        overridden per-call in aggregate_results.

        Parameters
        ----------
        trade_order_list : str (list), optional
            Which trade/order names should this form be calculated for?

        metric_name : str (list), optional
            Metric(s) to aggregate (eg. 'slippage')

        filter_by : str (list), default ['all']
            How to sort/truncate rows, eg. 'all', 'worst_10', 'best_5', 'worst_all', 'best_all'

        tag_value_combinations : dict, optional
            Restrict trades/orders to those matching these tag/value combinations

        keep_fields : str (list)
            Non-metric columns to retain in the output table

        replace_text : dict, optional
            Text replacements applied to column names of the output

        round_figures_by : int, default 1
            Decimal places to round the output by

        scalar : float, default 1.0
            Multiplier applied to the metric values (eg. to convert into basis points)

        weighting_field : str
            Field used when computing the weighted average of each column

        exclude_fields_from_avg : str (list)
            Fields excluded from the weighted average calculation
        """
        self._trade_order_list = trade_order_list
        self._metric_name = metric_name
        self._results_summary = ResultsSummary()
        self._keep_fields = keep_fields
        self._filter_by = filter_by
        self._replace_text = replace_text
        self._round_figures_by = round_figures_by
        self._weighting_field = weighting_field
        self._scalar = scalar
        self._exclude_fields_from_avg = exclude_fields_from_avg

        self._tag_value_combinations = tag_value_combinations
        self._trade_order_filter_tag = TradeOrderFilterTag()

        self._results_form_tag = 'table'

        self._util_func = UtilFunc()
        self._time_series_ops = TimeSeriesOps()

    def aggregate_results(self, trade_order_df=None, market_df=None, filter_by=[],
                          trade_order_name=None, metric_name=None, ticker=None, filter_nan=True,
                          weighting_field=None, tag_value_combinations={}, keep_fields=[],
                          remove_fields=[], replace_text={}, round_figures_by=None, scalar=None,
                          exclude_fields_from_avg=None):
        """Aggregates a trade/order DataFrame into one table per (filter, metric) combination,
        applying sorting/truncation, weighted averaging, scaling, rounding and column-name
        text replacement.

        Parameters not supplied (None / empty sentinel) fall back to the defaults given in the
        constructor.

        Returns
        -------
        list
            (DataFrame, str) tuples tagged 'table_<trade_order_name>_<metric>_by_<filter>';
            a None entry is appended where the metric or weighting field is missing.
            Returns [None, None] if this form does not apply to trade_order_name.
        """
        if not (self._check_calculate_results(trade_order_name)):
            return [None, None]

        # Fall back to constructor defaults for any parameter not specified on this call
        if metric_name is None: metric_name = self._metric_name
        if keep_fields == []: keep_fields = self._keep_fields
        if filter_by == []: filter_by = self._filter_by
        if round_figures_by is None: round_figures_by = self._round_figures_by
        if replace_text == {}: replace_text = self._replace_text
        if weighting_field is None: weighting_field = self._weighting_field
        if tag_value_combinations == {}: tag_value_combinations = self._tag_value_combinations
        if scalar is None: scalar = self._scalar
        if exclude_fields_from_avg is None: exclude_fields_from_avg = self._exclude_fields_from_avg

        if not (isinstance(metric_name, list)): metric_name = [metric_name]
        if not (isinstance(filter_by, list)): filter_by = [filter_by]

        # Restrict to trades/orders matching the requested tag/value combinations
        trade_order_df = self._trade_order_filter_tag.filter_trade_order(
            trade_order_df, tag_value_combinations=tag_value_combinations)

        results = []

        for filt in filter_by:
            for met in metric_name:
                if met not in trade_order_df.columns:
                    # Metric column not present for this trade/order set
                    results.append(None)
                elif weighting_field is not None and weighting_field not in trade_order_df.columns:
                    # Cannot compute a weighted average without the weighting field
                    results.append(None)
                else:
                    # Keep every column derived from this metric (eg. 'slippage',
                    # 'slippage_benchmark') alongside the requested keep_fields
                    metric_fields_to_filter = [
                        x for x in trade_order_df.columns if met in x
                    ]

                    columns_to_keep = self._util_func.flatten_list_of_lists(
                        [keep_fields, metric_fields_to_filter])

                    results_df = trade_order_df[columns_to_keep]

                    # Apply 'worst_N' / 'best_N' style sorting and truncation ('all' keeps
                    # every row, only sorted)
                    if 'worst' in filt:
                        ordinal = filt.split('worst_')[1]

                        results_df = results_df.sort_values(by=met, ascending=True)

                        if ordinal != 'all':
                            results_df = results_df.head(int(ordinal))
                    elif 'best' in filt:
                        # Fix: previously split on 'worst_' here, which raised IndexError for
                        # every 'best_N' filter
                        ordinal = filt.split('best_')[1]

                        results_df = results_df.sort_values(by=met, ascending=False)

                        if ordinal != 'all':
                            # Fix: DataFrame.head needs an int; ordinal is parsed from the
                            # filter string (matches the 'worst' branch)
                            results_df = results_df.head(int(ordinal))

                    # Weighting field for average!
                    results_df = self._time_series_ops.weighted_average_of_each_column(
                        results_df, weighting_field, append=True,
                        exclude_fields_from_avg=exclude_fields_from_avg)

                    results_df = self._time_series_ops.multiply_scalar_dataframe(
                        results_df, scalar=scalar)

                    results_df = self._time_series_ops.round_dataframe(
                        results_df, round_figures_by, columns_to_keep=columns_to_keep)

                    results_df = self._util_func.replace_text_in_cols(
                        results_df, replace_text)

                    results.append(
                        (results_df, self._results_form_tag + '_' + trade_order_name +
                         '_' + met + '_by_' + filt))

        return results