Example #1
class DatabasePopulator(ABC):
    """DatabasePopulator connects from one data source (typically an external one via a DatabaseSource eg. DatabaseNCFX)
    downloads historical data from that and then dumps it locally
    """
    def __init__(self,
                 temp_data_folder=constants.temp_data_folder,
                 temp_large_data_folder=constants.temp_large_data_folder,
                 tickers=None,
                 data_store=None):

        self.temp_data_folder = temp_data_folder
        self.temp_large_data_folder = temp_large_data_folder
        self.tickers = None
        self.util_func = UtilFunc()
        self.time_series_ops = TimeSeriesOps()
        self.data_store = data_store

        logger = LoggerManager().getLogger(__name__)

        if not (os.path.isdir(self.temp_data_folder)):
            logger.warn("Temp data folder " + self.temp_data_folder +
                        " does not exist")

        if not (os.path.isdir(self.temp_large_data_folder)):
            logger.warn("Temp large data folder " + self.temp_data_folder +
                        " does not exist")

        if tickers is not None:
            self.tickers = tickers

    @abc.abstractmethod
    def _fetch_market_data(self,
                           start,
                           finish,
                           ticker,
                           write_to_disk=True,
                           read_cached_from_disk=True,
                           web_proxies=constants.web_proxies):
        """Fetches market data in a single download for a ticker. We need to be careful not to specify chunks which are
        too large, as many external sources will have a limit on how much data we can download in one chunk.

        Parameters
        ----------
        start : datetime
            Start date/time of the download

        finish : datetime
            Finish date/time of the download

        ticker : str
            Ticker to be downloaded

        write_to_disk : bool
            Should the downloaded chunk be cached on disk?

        read_cached_from_disk : bool
            Should we read a cached chunk from disk if it already exists?

        web_proxies : dict
            Addresses for web proxies

        Returns
        -------

        """
        pass

    @abc.abstractmethod
    def _get_postfix(self):
        """The postfix which represents this data source, eg. 'ncfx' for New Change FX or 'dukascopy' for Dukascopy

        Returns
        -------
        str
        """
        pass

    @abc.abstractmethod
    def _get_output_data_source(self):
        """Gets the DatabaseSource object which represents how we wish to store the market data internally

        Returns
        -------
        DatabaseSource
        """
        return

    def _remove_weekend_points(self):
        return True

    @abc.abstractmethod
    def _get_input_data_source(self):
        """Gets the DatabaseSource object which represents how we input the market data (typically, this will be from
        an external data source)

        Returns
        -------
        DatabaseSource
        """
        return

    @abc.abstractmethod
    def _get_tickers(self):
        """List of tickers that can accessedd from the external/input DatabaseSource

        Returns
        -------
        str (list)
        """
        return

    @abc.abstractmethod
    def _get_threads(self, start_date_hist=None, finish_date_hist=None):
        """How many threads to use when downloading from our external/input DatabaseSource

        Returns
        -------
        int
        """
        return

    def download_to_csv(self,
                        start_date,
                        finish_date,
                        tickers,
                        remove_duplicates=True,
                        split_size='monthly',
                        chunk_int_min=None,
                        include_partial_periods=False,
                        write_temp_to_disk=True,
                        write_large_csv=True,
                        write_large_hdf5_parquet=True,
                        csv_folder=constants.csv_folder,
                        csv_compression=None,
                        return_df=False,
                        web_proxies=constants.web_proxies):
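        """Downloads market data from the external data source for the requested tickers, splitting the download into
        periods (eg. monthly), and writes each period to large CSV and/or HDF5/Parquet files on disk.

        Returns
        -------
        str (list), DataFrame (dict)
            Status messages for any failed downloads and (if return_df is True) the downloaded data for each ticker
        """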

        start_date = self.time_series_ops.date_parse(start_date)
        finish_date = self.time_series_ops.date_parse(finish_date)

        dates = self.util_func.split_date_single_list(
            start_date,
            finish_date,
            split_size=split_size,
            add_partial_period_start_finish_dates=include_partial_periods)

        df_dict = {}
        msg = []

        for i in range(0, len(dates) - 1):
            msg_list, df_dict_list = self.download_from_external_source(
                start_date=dates[i],
                finish_date=dates[i + 1],
                tickers=tickers,
                chunk_int_min=chunk_int_min,
                append_data=False,
                remove_duplicates=remove_duplicates,
                write_temp_to_disk=write_temp_to_disk,
                write_to_disk_db=False,
                write_large_csv=write_large_csv,
                write_large_hdf5_parquet=write_large_hdf5_parquet,
                csv_folder=csv_folder,
                csv_compression=csv_compression,
                return_df=return_df,
                web_proxies=web_proxies)

            if msg_list != []:
                msg.append(msg_list)

            if return_df:
                for k in df_dict_list.keys():
                    if k in df_dict.keys():
                        df_dict[k] = pd.concat([df_dict[k], df_dict_list[k]])
                    else:
                        df_dict[k] = df_dict_list[k]

        return self.util_func.flatten_list_of_lists(msg), df_dict

    def download_from_external_source(self,
                                      append_data=True,
                                      remove_duplicates=True,
                                      if_exists_table='append',
                                      if_exists_ticker='append',
                                      number_of_days=30 * 7,
                                      chunk_int_min=None,
                                      start_date=None,
                                      finish_date=None,
                                      delete_cached_files=False,
                                      tickers=None,
                                      write_temp_to_disk=True,
                                      write_to_disk_db=True,
                                      read_cached_from_disk=True,
                                      write_large_csv=False,
                                      write_large_hdf5_parquet=True,
                                      csv_folder=constants.csv_folder,
                                      csv_compression=None,
                                      return_df=False,
                                      web_proxies=constants.web_proxies):
        """Downloads market data from an external source and then dumps to HDF5/Parquet files for temporary storage which is cached.
        If HDF5/Parquet cached files already exist for a time segment we read them in, saving us to make an external data call.

        Lastly, dumps it to an internal database.

        Parameters
        ----------
        append_data : bool
            True - only start collecting later data not already in database (ignoring number_of_days parameter)
            False - start collecting all data, ignoring anything stored in database

        remove_duplicates : bool
            True (default) - remove values which are repeated
            False - leave in repeated values

        if_exists_table : str
            'append' - if database table already exists append data to it
            'replace' - remove existing database table

        if_exists_ticker : str
            'append' - if ticker already exists in the database, append to it
            'replace' - replace any data for this ticker

        number_of_days : int
            Number of days to download data for

        chunk_int_min : int (None)
            Size of each download (default - specified in constants)

        Returns
        -------

        """
        # Swim()

        logger = LoggerManager.getLogger(__name__)

        if write_to_disk_db:
            data_source_local = self._get_output_data_source()

        if write_large_csv:
            if not (os.path.isdir(csv_folder)):
                logger.warn("CSV folder " + self.temp_data_folder +
                            " where we are about to write does not exist")

        # What chunk size in minutes do we want for this data provider?
        if chunk_int_min is None:
            chunk_int_min = self._get_download_chunk_min_size()

        if chunk_int_min is None:
            chunk_size_str = None
        else:
            chunk_size_str = str(chunk_int_min) + "min"

        if tickers is None:
            tickers = self._get_tickers()

        if isinstance(tickers, str):
            tickers = [tickers]

        # If there's no start or finish date, choose a default start/finish date
        if start_date is None and finish_date is None:
            finish_date = datetime.datetime.utcnow()
            finish_date = datetime.datetime(finish_date.year,
                                            finish_date.month, finish_date.day,
                                            0, 0, 0, 0)

            start_date = finish_date - timedelta(days=number_of_days)  # 30*7
        else:
            start_date = self.time_series_ops.date_parse(start_date)
            finish_date = self.time_series_ops.date_parse(finish_date)

        if finish_date < start_date:
            logger.error("Download finish date is before start data!")

            return

        now = pd.Timestamp(datetime.datetime.utcnow(), tz='utc')

        # Do not allow downloading of future data!
        if finish_date > now:
            finish_date = now

        df_dict = {}

        # Loop through each ticker
        for ticker in tickers:

            has_old = False

            if delete_cached_files and write_to_disk_db:
                logger.info("Deleting all cached temp files for " + ticker)

                for name in glob.glob(self.temp_data_folder + '/*' + ticker +
                                      "*"):
                    try:
                        os.remove(name)
                    except:
                        logger.warn("Couldn't delete file " + name)

                logger.info("Finished deleting cached files for " + ticker)

            # If we have been asked to append data, load up what you can from the internal database
            # find the last point
            if append_data and if_exists_ticker == 'append' and write_to_disk_db:
                logger.info("Trying to download old data first for " + ticker)

                try:
                    df_old = data_source_local.fetch_market_data(
                        start_date,
                        finish_date,
                        ticker,
                        web_proxies=web_proxies)

                    # This will vary between tickers (in particular if we happen to add a new ticker)
                    start_date = df_old.index[-1]

                    has_old = True

                    # Remove reference - big file!
                    df_old = None

                except Exception as e:
                    logger.info("No data found for ticker " + ticker +
                                " with error: " + str(e))
            else:
                logger.info("Downloading new data for " + ticker + ".")

            # Date range may not work with timezones
            start_date = pd.Timestamp(start_date.replace(tzinfo=None))
            finish_date = pd.Timestamp(finish_date.replace(tzinfo=None))

            if finish_date - start_date < pd.Timedelta(days=1):
                start_date_list = [start_date, finish_date]
            else:
                # download from that last point to the present day
                start_date_list = pd.date_range(start_date, finish_date)

                start_date_list = [
                    pd.Timestamp(x.to_pydatetime()) for x in start_date_list
                ]

                if finish_date > start_date_list[-1]:
                    start_date_list.append(finish_date)

            df = None
            filename = os.path.join(self.temp_data_folder,
                                    ticker) + '.' + fileformat

            try:
                # df = UtilFunc().read_dataframe_from_hdf(filename)
                pass
            except:
                logger.info("Couldn't read HDF5/Parquet file for " + ticker)

            # Create downloads in x minute chunks (if we request very large chunks of data with certain data providers,
            # we could cause problems!)
            if df is None:
                df_remote_list = []

                # Loop by day (otherwise can end up with too many open files!)
                for i in range(0, len(start_date_list) - 1):

                    if chunk_size_str is not None:
                        if start_date_list[
                                i + 1] - start_date_list[i] < pd.Timedelta(
                                    minutes=chunk_int_min):
                            start_date_hist = [start_date_list[i]]
                            finish_date_hist = [start_date_list[i + 1]]
                        else:
                            start_date_hist, finish_date_hist = UtilFunc(
                            ).split_into_freq(start_date_list[i],
                                              start_date_list[i + 1],
                                              freq=chunk_size_str)
                    else:
                        start_date_hist = [start_date_list[i]]
                        finish_date_hist = [start_date_list[i + 1]]

                    # For FX and most other markets we should remove weekends (cryptocurrencies do have weekend data)
                    if self._remove_weekend_points():
                        start_date_hist, finish_date_hist = UtilFunc(
                        ).remove_weekend_points(start_date_hist,
                                                finish_date_hist)

                    output = []

                    if constants.use_multithreading:

                        # Create a multiprocess object for downloading data
                        swim = Swim(parallel_library=constants.
                                    database_populator_threading_library)
                        pool = swim.create_pool(thread_no=self._get_threads())

                        result = []

                        for i in range(0, len(start_date_hist)):
                            # output.append(self._fetch_market_data(start_date_hist[i], finish_date_hist[i], ticker))

                            result.append(
                                pool.apply_async(
                                    self._fetch_market_data,
                                    args=(start_date_hist[i],
                                          finish_date_hist[i], ticker,
                                          write_temp_to_disk,
                                          read_cached_from_disk, web_proxies)))

                        output = [p.get() for p in result]

                        swim.close_pool(pool, True)
                    else:
                        # Otherwise run in single threaded fashion
                        for i in range(0, len(start_date_hist)):
                            output.append(
                                self._fetch_market_data(
                                    start_date_hist[i],
                                    finish_date_hist[i],
                                    ticker,
                                    write_to_disk=write_temp_to_disk,
                                    read_cached_from_disk=read_cached_from_disk,
                                    web_proxies=web_proxies))

                    # Get all the dataframe chunks and returned messages
                    df_list = [
                        self._remove_duplicates_time_series(x,
                                                            remove_duplicates,
                                                            field='mid')
                        for x, y in output if x is not None
                    ]
                    msg_list = [
                        y for x, y in output if x is not None and y is not None
                    ]

                    # Concatenate all the 5 (or larger) minute data chunks
                    try:
                        if df_list != []:
                            df_temp = pd.concat(df_list)

                            if df_temp is not None:
                                if not (df_temp.empty):
                                    df_remote_list.append(df_temp)

                    except Exception as e:
                        logger.error(str(e))

                if df_remote_list != []:
                    df = pd.concat(df_remote_list)

                    # Need to sort data (database assumes sorted data for chunking/searches)
                    df = df.sort_index()
                    df = self.time_series_ops.localize_as_UTC(df)

                    if write_large_hdf5_parquet:
                        if df is not None:
                            if not (df.empty):
                                key =  '_' + self._get_postfix() + "_" + \
                                       (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')
                                filename = os.path.join(
                                    csv_folder,
                                    ticker + key) + '.' + fileformat

                                # Temporary cache for testing purposes (also if the process crashes, we can read this back in)
                                UtilFunc().write_dataframe_to_binary(
                                    df, filename, format=binary_format)

            if df is not None:
                # Assume UTC time (don't want to mix UTC and non-UTC in database!)
                df = self.time_series_ops.localize_as_UTC(df)

            # write CSV
            if write_large_csv:
                if df is not None:
                    if not (df.empty):
                        key = '_' + self._get_postfix() + "_" + \
                              (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')

                        if csv_compression == 'gzip':
                            df.to_csv(os.path.join(csv_folder,
                                                   ticker + key + ".csv.gz"),
                                      compression='gzip')
                        else:
                            df.to_csv(
                                os.path.join(csv_folder,
                                             ticker + key + ".csv"))

            if return_df:
                df_dict[ticker] = df

            # Dump what we have locally (or whatever DatabaseSource we have defined)
            try:

                start_date = start_date.replace(tzinfo=pytz.utc)

                # Remove first point if it matches the last point already in the database
                if has_old:
                    if df.index[0] == start_date:
                        df = df[1:]

                if df is not None:
                    df = df.sort_index()

                    df = self._remove_duplicates_time_series(df,
                                                             remove_duplicates,
                                                             field='mid')

                if write_to_disk_db and df is not None:
                    data_source_local.append_market_data(
                        df,
                        ticker,
                        if_exists_table=if_exists_table,
                        if_exists_ticker=if_exists_ticker)

                    logger.info("Wrote to database for " + ticker)

            except Exception as e:
                final_err = "Data was missing for these dates " + str(start_date) + " - " + str(finish_date) + " for " \
                            + str(tickers) + " Didn't write anything to disk or return any valid dataframe: " + str(e)

                logger.error(final_err)

            if df is None:
                msg_list.append("No downloaded data for " + str(start_date) +
                                " - " + str(finish_date) +
                                ". Is this a holiday?")

        # Returns a status containing any failed downloads, which can be read by a user
        return msg_list, df_dict

    def _remove_duplicates_time_series(self,
                                       df,
                                       remove_duplicates,
                                       field='mid'):

        if remove_duplicates:
            df = self.time_series_ops.drop_consecutive_duplicates(df, field)

        return df

    def combine_mini_df_from_disk(self, tickers=None, remove_duplicates=True):
        """Combines the mini HDF5/Parquet files for eg. 5 min chunks and combine into a very large HDF5/Parquet file, which is likely to be
        for multiple months of data. Uses multithreading to speed up, by using a thread for each different ticker.

        Parameters
        ----------
        tickers : str (list or ditc)
            Ticker of each ticker

        remove_duplicates : bool
            Remove duplicated market prices, which follow one another

        Returns
        -------

        """

        if tickers is None: tickers = list(self.tickers.keys())

        if isinstance(tickers, dict): tickers = list(tickers.keys())

        if not (isinstance(tickers, list)):
            tickers = [tickers]

        if constants.use_multithreading:
            swim = Swim(parallel_library=constants.
                        database_populator_threading_library)
            pool = swim.create_pool(thread_no=self._get_threads())

            result = []

            for i in range(0, len(tickers)):
                result.append(
                    pool.apply_async(
                        self._combine_mini_df_from_disk_single_thread,
                        args=(
                            tickers[i],
                            remove_duplicates,
                        )))

            output = [p.get() for p in result]

            swim.close_pool(pool, True)

        else:
            for i in range(0, len(tickers)):
                self._combine_mini_df_from_disk_single_thread(
                    tickers[i], remove_duplicates)

    def _combine_mini_df_from_disk_single_thread(self,
                                                 ticker,
                                                 remove_duplicates=True):

        logger = LoggerManager.getLogger(__name__)
        time_series_ops = TimeSeriesOps()

        logger.info('Getting ' + ticker + ' filenames...')
        temp_data_folder = self.temp_data_folder

        filename_list = []

        for root, dirnames, filenames in os.walk(temp_data_folder):

            for filename in filenames:
                if ticker in filename and '.' + fileformat in filename:
                    filename_h5_parquet = os.path.join(root, filename)

                    # if filename is less than 10MB add (otherwise likely a very large aggregated file!)
                    if os.path.getsize(filename_h5_parquet) < 10 * 1024 * 1024:
                        filename_list.append(filename_h5_parquet)

        df_list = []

        util_func = UtilFunc()

        logger.info('Loading ' + ticker + ' mini dataframes into memory')

        i = 0

        if len(filename_list) == 0:
            logger.warn("Looks like there are no files for " + ticker +
                        " in " + temp_data_folder +
                        ". Are you sure path is correct?")

        # Go through each mini file which represents a few minutes of data and append it
        for filename in filename_list:
            filesize = 0

            try:
                filesize = os.path.getsize(filename) / 1024.0
                df = util_func.read_dataframe_from_binary(filename,
                                                          format=binary_format)

                i = i + 1

                # Every 100 files print reading progress
                if i % 100 == 0:
                    logger.info('Reading ' + filename + ' number ' + str(i))

                if df is not None:
                    df = df.sort_index()
                    df = self._remove_duplicates_time_series(df,
                                                             remove_duplicates,
                                                             field='mid')

                    df_list.append(df)
            except Exception as e:
                logger.warn('Failed to parse ' + filename + " of " +
                            str(filesize) + "KB")  # + str(e))

            # if i > 1000:
            #    break

        # Assume UTC time (don't want to mix UTC and non-UTC in database!)
        if df_list == []:
            logger.warn('No dataframe read for ' + ticker +
                        ', cannot combine!')

            return

        logger.info('About to combine ' + ticker +
                    ' into large dataframe to write to disk...')

        df = pd.concat(df_list)
        df = time_series_ops.localize_as_UTC(df)

        df = df.sort_index()

        df = self._remove_duplicates_time_series(df,
                                                 remove_duplicates,
                                                 field='mid')

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder,
                                ticker + postfix) + '.' + fileformat

        df = time_series_ops.localize_as_UTC(df)
        util_func.write_dataframe_to_binary(df, filename, format=binary_format)

    def write_df_to_db(self,
                       tickers=None,
                       remove_duplicates=True,
                       if_exists_table='append',
                       if_exists_ticker='replace'):
        """Loads up a large HDF5/Parquet file from disk into a pd DataFrame and then dumps locally.
        Uses multithreading to speed it up, by using a thread for each different ticker.

        Parameters
        ----------
        tickers : str (list or dict)
            List of tickers

        remove_duplicates : bool
            True (default) - removes any follow on duplicates in the dataset

        if_exists_table : str
            'append' - if database table already exists append data to it
            'replace' - remove existing database table

        if_exists_ticker : str
            'append' - if ticker already exists in the database, append to it
            'replace' - replace any data for this ticker

        Returns
        -------

        """

        if tickers is None: tickers = list(self.tickers.keys())

        if isinstance(tickers, dict): tickers = list(tickers.keys())

        if not (isinstance(tickers, list)):
            tickers = [tickers]

        if constants.use_multithreading:

            swim = Swim(parallel_library=constants.
                        database_populator_threading_library)
            pool = swim.create_pool(thread_no=self._get_threads())

            result = []

            for i in range(0, len(tickers)):
                result.append(
                    pool.apply_async(self._write_df_to_db_single_thread,
                                     args=(
                                         tickers[i],
                                         remove_duplicates,
                                         if_exists_table,
                                         if_exists_ticker,
                                     )))

            output = [p.get() for p in result]

            swim.close_pool(pool, True)
        else:
            for i in range(0, len(tickers)):
                self._write_df_to_db_single_thread(tickers[i],
                                                   remove_duplicates,
                                                   if_exists_table,
                                                   if_exists_ticker)

    def _write_df_to_db_single_thread(self,
                                      ticker,
                                      remove_duplicates=True,
                                      if_exists_table='append',
                                      if_exists_ticker='replace'):

        logger = LoggerManager.getLogger(__name__)

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder,
                                ticker + postfix) + '.' + fileformat

        logger.info("Reading " + filename)

        util_func = UtilFunc()
        time_series_ops = TimeSeriesOps()
        data_source_local = self._get_output_data_source()

        df = util_func.read_dataframe_from_binary(filename,
                                                  format=binary_format)

        if df is not None:
            df = time_series_ops.localize_as_UTC(df)

            data_source_local.append_market_data(
                df,
                ticker,
                if_exists_table=if_exists_table,
                if_exists_ticker=if_exists_ticker)
        else:
            logger.warn("Couldn't write dataframe for " + ticker +
                        " to database, appears it is empty!")

    def _remove_saturday(self):
        return True
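
A minimal usage sketch follows. The concrete subclass name DatabasePopulatorNCFX and the 'EURUSD' ticker are illustrative assumptions (any subclass implementing the abstract methods above would do); this is not a confirmed part of the file.

if __name__ == '__main__':
    # Hypothetical concrete subclass of DatabasePopulator (illustrative name)
    populator = DatabasePopulatorNCFX(tickers={'EURUSD': 'EURUSD'})

    # Download a month of tick data in monthly splits, writing large CSV and HDF5/Parquet files to disk
    msg, df_dict = populator.download_to_csv(
        '01 Jan 2020', '01 Feb 2020',
        tickers=['EURUSD'],
        split_size='monthly',
        write_large_csv=True,
        return_df=True)

    # Any failed download periods are reported back as messages
    print(msg)

    # Combine the small cached chunk files into one large file per ticker, then push it into the database
    populator.combine_mini_df_from_disk(tickers=['EURUSD'])
    populator.write_df_to_db(tickers=['EURUSD'], if_exists_ticker='replace')
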
Example #2
class SessionManager(object):
    """Manages the caching of properties for a user's session. We use this extensively, to identify users and also to
    store variables relating to users on the server side.

    It is used, for example, for keeping track of which lines have been plotted, the user's zoom actions, whether tcapy
    has already plotted a particular dataset etc.

    """

    def __init__(self):
        self._util_func = UtilFunc()

    # session ID management functions

    def get_session_id(self):
        """Gets the current user's session ID and generates a unique one if necessary.

        Returns
        -------
        str
        """
        if 'id' not in session:
            id = str(uuid.uuid4())

            username = self.get_username()

            if username is not None:
                username = '******' + username
            else:
                username = ''

            session['id'] = id + username
        else:
            id = session['id']

        if not isinstance(id, str):
            id = id.decode("utf-8")

        return id

    def get_username(self):
        header = flask.request.headers.get('Authorization', None)

        if not header:
            return None

        username_password = base64.b64decode(header.split('Basic ')[1])
        username_password_utf8 = username_password.decode('utf-8')
        username, password = username_password_utf8.split(':')

        return username

    def set_session_flag(self, tag, value=None):
        """Sets a value with a specific tag in the session dictionary, which is essentially unique for every user.

        Parameters
        ----------
        tag : str (or dict)
            The "hash key" for our variable (if a dict is supplied, each key/value pair is set)
        value : str
            Value to store against that tag in our hash table

        Returns
        -------

        """
        if isinstance(tag, str):
            tag = [tag]

        if isinstance(tag, dict):
            for t in tag:
                self.set_session_flag(t, value=tag[t])

            return

        tag = self._util_func.flatten_list_of_lists(tag)

        for t in tag:
            session[t] = value

    def get_session_flag(self, tag):
        """Gets the value of a tag in the user's session

        Parameters
        ----------
        tag : str
            Tag to be fetched

        Returns
        -------
        str
        """
        if tag in session:

            if isinstance(session[tag], bool):
                return session[tag]

            return str(session[tag])

        return None

    ##### these methods are for keeping track of which lines, user zooms have been plotted for each chart in the user's
    ##### session object

    def check_lines_plotted(self, lines_to_plot, tag):
        """Checks if the lines have been plotted for a particular user, by checking the plot's tag in their user session

        Parameters
        ----------
        lines_to_plot : str (list)
            Lines to be plotted

        tag : str
            Tag of plotted lines

        Returns
        -------
        bool
        """

        if tag in session:
            lines_plotted = session[tag]

            if set(lines_to_plot) == set(lines_plotted):
                return True

        return False

    def check_relayoutData_plotted(self, relayoutData, tag):
        """Checks if the relayout data (ie. related to user's clicks, such as when they zoom in) has already been plotted.

        Parameters
        ----------
        relayoutData : dict

        tag : str
            Tag referring to a particular plot

        Returns
        -------

        """


        if tag in session:
            # relayoutDataSet = None
            # sessionTagSet = None
            #
            # if relayoutData is not None:
            #     relayoutDataSet = set(relayoutData)
            #
            # if session[tag] is not None:
            #     sessionTagSet = set(session[tag])
            # if relayoutData is None:
            #    return False

            if relayoutData == session[tag]:
                return True

        return False

    def set_lines_plotted(self, lines_to_plot, tag):
        """Sets the lines plotted for a particular chart tag in the user's session

        Parameters
        ----------
        lines_to_plot : str (list)
            Lines plotted

        tag : str
            Tag of the plot

        Returns
        -------

        """
        session[tag] = lines_to_plot

    def set_relayoutData_plotted(self, relayoutData, tag):
        """Sets the user's clicks (typically for zooming into charts) for a particular chart

        Parameters
        ----------
        relayoutData : dict
            Details a user's click on the chart

        tag : str
            Tag referring to the plot

        Returns
        -------

        """
        session[tag] = relayoutData

    def set_username(self, username):
        session['username'] = username

    ##### We identify when a user has "clicked" a button by change in the number of clicks (Dash documentation recommends
    ##### this to handle user clicks)
    def get_session_clicks(self, tag):
        """Gets the number of clicks for the tag. If doesn't exist, we automatically set the tag as 0.

        Parameters
        ----------
        tag : str
            The tag for which we want to return the number of clicks

        Returns
        -------
        Number of clicks by current user
        """

        if tag not in session:
            return 0

        return session[tag]

    def set_session_clicks(self, tag, n_clicks, old_clicks=None):
        """Sets the number of clicks in the current user's session

        Parameters
        ----------
        tag : str
            Tag to store the user's clicks under
        n_clicks : int
            Number of clicks to set
        old_clicks : int (optional)
            Previous click count; when supplied, n_clicks is only stored if old_clicks exceeds it
        Returns
        -------

        """

        if old_clicks is None:
            session[tag] = n_clicks
        elif old_clicks > n_clicks:
            session[tag] = n_clicks

    def check_session_tag(self, tag):
        """Checks if a tag exists in the user's session, and if so returns the value of that tag in the user's session

        Parameters
        ----------
        tag : str
            Tag to check

        Returns
        -------
        str or bool
        """
        if tag in session:
            return session[tag]

        return False

    def exists_session_tag(self, tag):
        """Does a tag exist in the current user session?

        Parameters
        ----------
        tag : str

        Returns
        -------
        bool
        """
        return tag in session

    def check_session_reset_tag(self, tag):
        """Checks if a tag is in session (if that tag exists already and is "True", then we reset it to "False"), otherwise
        return "False"

        Parameters
        ----------
        tag : str
            Tags to check

        Returns
        -------
        bool
        """
        if tag in session:
            old_tag = session[tag]

            if old_tag:
                session[tag] = False

                return True

            return False

        return False

    def create_calculated_flags(self, prefix, lst=None, lst2=None):
        """Creates a list for a combination of prefix and list elements.

        Parameters
        ----------
        prefix : str
            Prefix (typically a page name like 'detailed')

        lst : str (list)
            Tags will contain these

        lst2 : str (list)
            Tags will contain these

        Returns
        -------
        str (list)
        """

        if isinstance(prefix, list):
            prefix = self._util_func.flatten_list_of_lists(prefix)
            lst = [x + '-' + lst for x in prefix]
        elif isinstance(lst, list):
            lst = self._util_func.flatten_list_of_lists(lst)
            lst = [prefix + '-' + x for x in lst]

        if lst2 is None:
            return lst

        lst3 = []

        for i in lst2:
            for j in lst:
                lst3.append(j + '-' + i)

        return lst3
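
A short usage sketch of the click/plot tracking pattern described above. It assumes a running Flask/Dash app with a configured secret key (so that the flask `session` object is available); the tag names are illustrative only.

session_manager = SessionManager()

def on_calculate_button(n_clicks, lines_to_plot):
    # Detect a genuinely new click by comparing against the stored click count
    old_clicks = session_manager.get_session_clicks('detailed-calculate-button')

    if n_clicks > old_clicks:
        session_manager.set_session_clicks('detailed-calculate-button', n_clicks)

        # Only replot lines which have not already been plotted for this user's session
        if not session_manager.check_lines_plotted(lines_to_plot, 'detailed-plot-lines'):
            session_manager.set_lines_plotted(lines_to_plot, 'detailed-plot-lines')
            # ... trigger the actual plotting here ...
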
Example #3
class TCAMarketTradeLoader(ABC):
    """TCAMarketTradeLoader provides wrapper methods to load market and trade data and also allows adds additional calculated
    fields to the trade data such as metrics (slippage, market impact etc), benchmarks (mid, VWAP, TWAP etc.) etc. as well
    as ways to process this output for display for multiple tickers. Underneath it uses TCATickerLoader, for fetching
    data/calculating metrics for individual tickers.

    Typically, TCAMarketTradeLoader will be called by an instance of TCAEngine. However, it can also be called directly, if we
    simply want to download market or trade data by itself.
    """
    def __init__(self, version=constants.tcapy_version):
        self._util_func = UtilFunc(
        )  # general utility operations (such as flatten lists)
        self._trade_order_tag = TradeOrderFilterTag(
        )  # to filter trade/orders according to the values of certain tags

        self._version = version

    def get_market_data(self, market_request):
        """Gets market data for tickers. When we ask for non-standard FX crosses, only the mid-field is
        returned (calculated as a cross rate). We do not give bid/ask quotes for calculated non-standard tickers, as these
        can difficult to estimate.

        Parameters
        ----------
        market_request : MarketRequest
            The type of market data to get

        Returns
        -------
        DataFrame
        """

        tca_ticker_loader = Mediator.get_tca_ticker_loader(
            version=self._version)

        if isinstance(market_request.ticker, list):
            if len(market_request.ticker) > 1:
                market_request_list = self._split_tca_request_into_list(
                    market_request)

                market_df_dict = {}

                for market_request_single in market_request_list:
                    market_df_dict[market_request_single.ticker] = \
                        tca_ticker_loader.get_market_data(market_request_single)

                return market_df_dict

        return tca_ticker_loader.get_market_data(market_request)

    def get_trade_order_data(self, tca_request, trade_order_type):
        """Gets trade data for specified parameters (eg. start/finish dates tickers). Will also try to find trades
        when they have booked in the inverted market convention, and change the fields appropriately. For example, if
        we ask for GBPUSD trade data, it will also search for USDGBP and convert those trades in the correct convention.

        Parameters
        ----------
        tca_request : TCARequest
            What type of trade data do we want

        trade_order_type : str
            Do we want trade or order data?

        Returns
        -------
        DataFrame
        """

        tca_ticker_loader = Mediator.get_tca_ticker_loader(
            version=self._version)

        if isinstance(tca_request.ticker, list):
            if len(tca_request.ticker) > 1:
                tca_request_list = self._split_tca_request_into_list(
                    tca_request)

                trade_df_list = []

                for tca_request_single in tca_request_list:
                    trade_df_list.append(
                        tca_ticker_loader.get_trade_order_data(
                            tca_request_single, trade_order_type))

        df_dict = tca_ticker_loader.get_trade_order_data(
            tca_request, trade_order_type)

        return df_dict

    def get_trade_order_holder(self, tca_request):
        """Gets the trades/orders in the form of a TradeOrderHolder

        Parameters
        ----------
        tca_request : TCARequest
            Parameters for the TCA computation

        Returns
        -------
        TradeOrderHolder
        """

        tca_ticker_loader = Mediator.get_tca_ticker_loader(
            version=self._version)

        if isinstance(tca_request.ticker, list):
            if len(tca_request.ticker) > 1:
                tca_request_list = self._split_tca_request_into_list(
                    tca_request)

                trade_order_holder = DataFrameHolder()

                for tca_request_single in tca_request_list:
                    trade_order_holder.add_dataframe_holder(
                        tca_ticker_loader.get_trade_order_holder(
                            tca_request_single))

                return trade_order_holder

        return tca_ticker_loader.get_trade_order_holder(tca_request)

    def get_market_trade_order_holder(self, tca_request):
        """Gets the both the market data and trade/order data associated with a TCA calculation as a tuple of
        (DataFrame, DataFrameHolder)

        Parameters
        ----------
        tca_request : TCARequest
            Parameters for a TCA calculation

        Returns
        -------
        DataFrame, DataFrameHolder
        """

        return Mediator.get_tca_ticker_loader(
            version=self._version).get_market_trade_order_holder(tca_request)

    def load_market_calculate_summarize_metrics(self,
                                                tca_request,
                                                dummy_market=False):
        """Splits up the TCA request into individual tickers. Market/trade data is loaded for each ticker, before
        conducting TCA (ie. calculating metrics, benchmarks etc.). Returns a dictionary consisting of market data and
        another dictionary of trade/order data (and any additional results associated with the TCA)

        Parameters
        ----------
        tca_request : TCARequest
            Parameters defining the TCA calculation

        dummy_market : bool, default False
            If True, market data is not returned (which saves memory)

        Returns
        -------
        DataFrame (dict), DataFrame (dict)
        """

        # Load market/trade data and compute metrics/benchmarks etc. per ticker
        market_df_dict, trade_order_results_df_dict, tca_request_list = \
            self.get_market_trade_metrics(tca_request, dummy_market=dummy_market)

        # If every ticker we have selected doesn't have trades (and our analysis also requires trades), we can't do any TCA at all
        if len(trade_order_results_df_dict) == 0 and tca_request.trade_data_store is not None \
                and tca_request.trade_order_mapping is None:
            logger = LoggerManager.getLogger(__name__)

            err_msg = "no trade data for specified ticker(s) and time range"

            logger.error(err_msg)

            raise DataMissingException(err_msg)

        # trade_df = trade_order_results_df_dict['trade_df']
        # Now summarize those metrics across all the tickers, for easier display
        return self.summarize_metrics(market_df_dict,
                                      trade_order_results_df_dict,
                                      tca_request_list,
                                      dummy_market=dummy_market)

    def summarize_metrics(self,
                          market_df_dict,
                          trade_order_results_df_dict,
                          tca_request_list,
                          dummy_market=False):
        """Takes in precomputed metrics across one or more tickers, and summarizes them for later user display
        (should be customised for users)

        Parameters
        ----------
        market_df_dict : DataFrame (dict)
            Market data for each ticker

        trade_order_results_df_dict : DataFrame (dict)
            Trade/order data and computed metrics for each ticker

        tca_request_list : TCARequest (list)
            List of TCARequests (typically, each is for a different ticker)

        dummy_market : bool
            If True, market data is not returned (saves memory)

        Returns
        -------
        DataFrame, DataFrame, DataFrame
        """

        # Allow user defined summary of metrics
        trade_order_results_df_dict = self._apply_summary_metrics(
            tca_request_list, trade_order_results_df_dict, market_df_dict)

        # Warning: do not ask for market data when requesting more than one ticker, could cause memory leaks!
        if (dummy_market):
            return None, trade_order_results_df_dict

        # Dictionary of market data and a dictionary of trades/orders/results of TCA analysis

        # TODO convert into strings
        return market_df_dict, trade_order_results_df_dict

    def _apply_summary_metrics(self, tca_request_list,
                               trade_order_results_df_dict, market_df_dict):
        return trade_order_results_df_dict

    def get_market_trade_metrics(self, tca_request, dummy_market=False):
        """Collects together all the market and trade data (and computes metrics) for each ticker specified in the
        TCARequest

        Parameters
        ----------
        tca_request : TCARequest
            Parameters for the TCA

        dummy_market : bool (default: False)
            Should dummy market data be returned (requires less memory)?

        Returns
        -------
        DataFrame (dict) , DataFrame (dict), TCARequest (list)
        """

        logger = LoggerManager.getLogger(__name__)

        logger.debug("Start loading trade/data/computation")

        # split up TCARequest into a list of TCA with different tickers
        tca_request_list = self._split_tca_request_into_list(tca_request)

        market_df_dict, trade_order_results_df_dict = self._get_market_trade_metrics(
            tca_request_list, dummy_market)

        logger.debug(
            "Finished loading data and calculating metrics on individual tickers"
        )

        return market_df_dict, trade_order_results_df_dict, tca_request_list

    def _get_market_trade_metrics(self, tca_request_list, dummy_market):
        """Gets the market and trade data, as well as computed metrics on them

        Parameters
        ----------
        tca_request_list : TCARequest (list)
            Requests for multiple TCARequests (eg. for different tickers)

        dummy_market : bool
            Return dummy market data?

        Returns
        -------
        DataFrame (dict), DataFrame (dict)
        """

        tca_ticker_loader = Mediator.get_tca_ticker_loader(
            version=self._version)

        market_df_dict = {}

        trade_order_holder_list = DataFrameHolder()

        for tca_request_single in tca_request_list:
            market_df, trade_order_df_dict = tca_ticker_loader.get_market_trade_order_holder(
                tca_request_single)

            market_df, trade_order_df_list, ticker, trade_order_keys = \
                tca_ticker_loader.calculate_metrics_single_ticker((market_df, trade_order_df_dict),
                                                                        tca_request_single, dummy_market)

            market_df_dict[ticker] = market_df

            trade_order_holder_list.add_dataframe_dict(
                dict(zip(trade_order_keys, trade_order_df_list)))

        # Unpack the DataFrameHolder into a dictionary (combining the lists of trade, orders etc. into single dataframes)
        # this may also decompress the trades
        trade_order_results_df_dict = trade_order_holder_list.get_combined_dataframe_dict(
        )

        return market_df_dict, trade_order_results_df_dict

    def _split_tca_request_into_list(self, tca_request):
        """Splits a TCA request by ticker.

        Parameters
        ----------
        tca_request : TCARequest
            TCA request to be broken up by ticker

        Returns
        -------
        TCARequest(list)
        """

        ticker = tca_request.ticker

        if not (isinstance(ticker, list)):
            ticker = [ticker]

        tca_request_list = []

        # go through every ticker (and also split into list)
        for tick in ticker:
            tca_request_temp = TCARequest(tca_request=tca_request)
            tca_request_temp.ticker = tick

            tca_request_list.append(tca_request_temp)

        return self._util_func.flatten_list_of_lists(tca_request_list)

    @abc.abstractmethod
    def get_tca_version(self):
        pass
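
A minimal usage sketch of the multi-ticker workflow. The concrete loader subclass (TCAMarketTradeLoaderImpl) and the TCARequest constructor arguments shown here are illustrative assumptions, and the example relies on Mediator.get_tca_ticker_loader being configured; none of this is confirmed by the file itself.

# Hypothetical concrete subclass which implements get_tca_version()
tca_market_trade_loader = TCAMarketTradeLoaderImpl()

# Assumed TCARequest parameters (ticker and trade_data_store are read by the methods above)
tca_request = TCARequest(start_date='01 Jan 2020', finish_date='01 Feb 2020',
                         ticker=['EURUSD', 'GBPUSD'],
                         trade_data_store='mysql', market_data_store='arctic-ncfx')

# Splits the request per ticker, loads market/trade data, computes metrics/benchmarks
# and summarizes them across tickers
market_df_dict, trade_order_results_df_dict = \
    tca_market_trade_loader.load_market_calculate_summarize_metrics(tca_request)

print(trade_order_results_df_dict.keys())
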
Example #4
class TableResultsForm(ResultsForm):
    """Takes in trade/orders and then creates aggregated metrics which are likely to be displayed as a table. Can also
    sort by best/worst metrics, rounding numbers etc.

    """
    def __init__(self,
                 trade_order_list=None,
                 metric_name=None,
                 filter_by=['all'],
                 tag_value_combinations={},
                 keep_fields=['executed_notional', 'side'],
                 replace_text={},
                 round_figures_by=1,
                 scalar=1.0,
                 weighting_field=constants.table_weighting_field,
                 exclude_fields_from_avg=[]):
        self._trade_order_list = trade_order_list
        self._metric_name = metric_name
        self._results_summary = ResultsSummary()
        self._keep_fields = keep_fields
        self._filter_by = filter_by
        self._replace_text = replace_text
        self._round_figures_by = round_figures_by
        self._weighting_field = weighting_field
        self._scalar = scalar
        self._exclude_fields_from_avg = exclude_fields_from_avg

        self._tag_value_combinations = tag_value_combinations
        self._trade_order_filter_tag = TradeOrderFilterTag()
        self._results_form_tag = 'table'
        self._util_func = UtilFunc()
        self._time_series_ops = TimeSeriesOps()

    def aggregate_results(self,
                          trade_order_df=None,
                          market_df=None,
                          filter_by=[],
                          trade_order_name=None,
                          metric_name=None,
                          ticker=None,
                          filter_nan=True,
                          weighting_field=None,
                          tag_value_combinations={},
                          keep_fields=[],
                          remove_fields=[],
                          replace_text={},
                          round_figures_by=None,
                          scalar=None,
                          exclude_fields_from_avg=None):
        if not (self._check_calculate_results(trade_order_name)):
            return [None, None]

        if metric_name is None: metric_name = self._metric_name
        if keep_fields == []: keep_fields = self._keep_fields
        if filter_by == []: filter_by = self._filter_by
        if round_figures_by is None: round_figures_by = self._round_figures_by
        if replace_text == {}: replace_text = self._replace_text
        if weighting_field is None: weighting_field = self._weighting_field
        if tag_value_combinations == {}:
            tag_value_combinations = self._tag_value_combinations
        if scalar is None: scalar = self._scalar
        if exclude_fields_from_avg is None:
            exclude_fields_from_avg = self._exclude_fields_from_avg

        if not (isinstance(metric_name, list)): metric_name = [metric_name]
        if not (isinstance(filter_by, list)): filter_by = [filter_by]

        trade_order_df = self._trade_order_filter_tag.filter_trade_order(
            trade_order_df, tag_value_combinations=tag_value_combinations)

        results = []

        for filt in filter_by:
            for met in metric_name:
                if met not in trade_order_df.columns:
                    results.append(None)

                elif weighting_field is not None and weighting_field not in trade_order_df.columns:
                    results.append(None)

                else:
                    metric_fields_to_filter = [
                        x for x in trade_order_df.columns if met in x
                    ]

                    columns_to_keep = self._util_func.flatten_list_of_lists(
                        [keep_fields, metric_fields_to_filter])

                    results_df = trade_order_df[columns_to_keep]

                    # Apply filter
                    if 'worst' in filt:
                        ordinal = filt.split('worst_')[1]

                        results_df = results_df.sort_values(by=met,
                                                            ascending=True)

                        if ordinal != 'all':
                            results_df = results_df.head(int(ordinal))

                    elif 'best' in filt:
                        ordinal = filt.split('best_')[1]
                        results_df = results_df.sort_values(by=met,
                                                            ascending=False)

                        if ordinal != 'all':
                            results_df = results_df.head(int(ordinal))

                    # Weighting field for average!
                    results_df = self._time_series_ops.weighted_average_of_each_column(
                        results_df,
                        weighting_field,
                        append=True,
                        exclude_fields_from_avg=exclude_fields_from_avg)

                    results_df = self._time_series_ops.multiply_scalar_dataframe(
                        results_df, scalar=scalar)
                    results_df = self._time_series_ops.round_dataframe(
                        results_df,
                        round_figures_by,
                        columns_to_keep=columns_to_keep)

                    results_df = self._util_func.replace_text_in_cols(
                        results_df, replace_text)

                    results.append(
                        (results_df, self._results_form_tag + '_' +
                         trade_order_name + '_' + met + '_by_' + filt))

        return results
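
A small usage sketch of TableResultsForm on a made-up trade DataFrame. The column names follow the defaults above ('executed_notional', 'side') plus an assumed 'slippage' metric column; constants.table_weighting_field is assumed to be a column present in the data.

import pandas as pd

trade_df = pd.DataFrame(
    {'executed_notional': [1e6, 2e6, 5e5],
     'side': [1, -1, 1],
     'slippage': [-0.0001, 0.00005, -0.0002]},
    index=pd.date_range('01 Jan 2020', periods=3, freq='min'))

table_form = TableResultsForm(trade_order_list=['trade_df'],
                              metric_name='slippage',
                              filter_by=['worst_all'],
                              scalar=10000.0,        # quote slippage in basis points
                              round_figures_by=2)

# Returns a list of (DataFrame, tag) pairs, eg. tagged 'table_trade_df_slippage_by_worst_all'
results = table_form.aggregate_results(trade_order_df=trade_df,
                                       trade_order_name='trade_df')

print(results[0][1])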