Example #1
class DatabasePopulator(ABC):
    """DatabasePopulator connects from one data source (typically an external one via a DatabaseSource eg. DatabaseNCFX)
    downloads historical data from that and then dumps it locally
    """
    def __init__(self,
                 temp_data_folder=constants.temp_data_folder,
                 temp_large_data_folder=constants.temp_large_data_folder,
                 tickers=None,
                 data_store=None):

        self.temp_data_folder = temp_data_folder
        self.temp_large_data_folder = temp_large_data_folder
        self.tickers = None
        self.util_func = UtilFunc()
        self.time_series_ops = TimeSeriesOps()
        self.data_store = data_store

        logger = LoggerManager().getLogger(__name__)

        if not (os.path.isdir(self.temp_data_folder)):
            logger.warn("Temp data folder " + self.temp_data_folder +
                        " does not exist")

        if not (os.path.isdir(self.temp_large_data_folder)):
            logger.warn("Temp large data folder " + self.temp_data_folder +
                        " does not exist")

        if tickers is not None:
            self.tickers = tickers

    @abc.abstractmethod
    def _fetch_market_data(self,
                           start,
                           finish,
                           ticker,
                           web_proxies=constants.web_proxies):
        """Fetches market data in a single download for a ticker. We need to be careful not to specify chunks which are
        too large, as many external sources will have a limit on how much data we can download in one chunk.

        Parameters
        ----------
        start : datetime
            Start date/time of the download

        finish : datetime
            Finish date/time of the download

        ticker : str
            Ticker to be downloaded

        web_proxies : dict
            Addresses for web proxies

        Returns
        -------

        """
        pass

    def _get_postfix(self):
        """The postfix which represents this data source, eg. 'ncfx' for New Change FX or 'dukascopy' for Dukascopy

        Returns
        -------
        str
        """
        pass

    @abc.abstractmethod
    def _get_output_data_source(self):
        """Gets the DatabaseSource object which represents how we wish to store the market data internally

        Returns
        -------
        DatabaseSource
        """
        return

    def _remove_weekend_points(self):
        return True

    @abc.abstractmethod
    def _get_input_data_source(self):
        """Gets the DatabaseSource object which represents how we input the market data (typically, this will be from
        an external data source)

        Returns
        -------
        DatabaseSource
        """
        return

    @abc.abstractmethod
    def _get_tickers(self):
        """List of tickers that can accessedd from the external/input DatabaseSource

        Returns
        -------
        str (list)
        """
        return

    @abc.abstractmethod
    def _get_threads(self, start_date_hist, finish_date_hist):
        """How many threads to use when downloading from our external/input DatabaseSource

        Returns
        -------
        int
        """
        return

    def download_to_csv(self,
                        start_date,
                        finish_date,
                        tickers,
                        remove_duplicates=True,
                        split_size='monthly',
                        chunk_int_min=None,
                        include_partial_periods=False,
                        write_temp_to_disk=True,
                        write_large_csv=True,
                        write_large_hdf5_parquet=True,
                        csv_folder=constants.csv_folder,
                        csv_compression=None,
                        return_df=False,
                        web_proxies=constants.web_proxies):

        start_date = self.time_series_ops.date_parse(start_date)
        finish_date = self.time_series_ops.date_parse(finish_date)

        dates = self.util_func.split_date_single_list(
            start_date,
            finish_date,
            split_size=split_size,
            add_partial_period_start_finish_dates=include_partial_periods)

        df_dict = {}
        msg = []

        for i in range(0, len(dates) - 1):
            msg_list, df_dict_list = self.download_from_external_source(
                start_date=dates[i],
                finish_date=dates[i + 1],
                tickers=tickers,
                chunk_int_min=chunk_int_min,
                append_data=False,
                remove_duplicates=remove_duplicates,
                write_temp_to_disk=write_temp_to_disk,
                write_to_disk_db=False,
                write_large_csv=write_large_csv,
                write_large_hdf5_parquet=write_large_hdf5_parquet,
                csv_folder=csv_folder,
                csv_compression=csv_compression,
                return_df=return_df,
                web_proxies=web_proxies)

            if msg_list != []:
                msg.append(msg_list)

            if return_df:
                for k in df_dict_list.keys():
                    if k in df_dict.keys():
                        df_dict[k] = df_dict[k].append(df_dict_list[k])
                    else:
                        df_dict[k] = df_dict_list[k]

        return self.util_func.flatten_list_of_lists(msg), df_dict

    def download_from_external_source(self,
                                      append_data=True,
                                      remove_duplicates=True,
                                      if_exists_table='append',
                                      if_exists_ticker='append',
                                      number_of_days=30 * 7,
                                      chunk_int_min=None,
                                      start_date=None,
                                      finish_date=None,
                                      delete_cached_files=False,
                                      tickers=None,
                                      write_temp_to_disk=True,
                                      write_to_disk_db=True,
                                      read_cached_from_disk=True,
                                      write_large_csv=False,
                                      write_large_hdf5_parquet=True,
                                      csv_folder=constants.csv_folder,
                                      csv_compression=None,
                                      return_df=False,
                                      web_proxies=constants.web_proxies):
        """Downloads market data from an external source and then dumps to HDF5/Parquet files for temporary storage which is cached.
        If HDF5/Parquet cached files already exist for a time segment we read them in, saving us to make an external data call.

        Lastly, dumps it to an internal database.

        Parameters
        ----------
        append_data : bool
            True - only start collecting later data not already in database (ignoring number_of_days parameter)
            False - start collecting all data, ignoring anything stored in database

        remove_duplicates : bool
            True (default) - remove values which are repeated
            False - leave in repeated values

        if_exists_table : str
            'append' - if database table already exists append data to it
            'replace' - remove existing database table

        if_exists_ticker : str
            'append' - if ticker already exists in the database, append to it
            'replace' - replace any data for this ticker

        number_of_days : int
            Number of days to download data for

        chunk_int_min : int (None)
            Size of each download chunk in minutes (default - specified in constants)

        Returns
        -------

        """
        # Swim()

        logger = LoggerManager.getLogger(__name__)

        if write_to_disk_db:
            data_source_local = self._get_output_data_source()

        if write_large_csv:
            if not (os.path.isdir(csv_folder)):
                logger.warn("CSV folder " + self.temp_data_folder +
                            " where we are about to write does not exist")

        # What chunk size in minutes do we want for this data provider?
        if chunk_int_min is None:
            chunk_int_min = self._get_download_chunk_min_size()

        if chunk_int_min is None:
            chunk_size_str = None
        else:
            chunk_size_str = str(chunk_int_min) + "min"

        if tickers is None:
            tickers = self._get_tickers()

        if isinstance(tickers, str):
            tickers = [tickers]

        # If there's no start or finish date, choose a default start/finish date
        if start_date is None and finish_date is None:
            finish_date = datetime.datetime.utcnow()
            finish_date = datetime.datetime(finish_date.year,
                                            finish_date.month, finish_date.day,
                                            0, 0, 0, 0)

            start_date = finish_date - timedelta(days=number_of_days)  # 30*7
        else:
            start_date = self.time_series_ops.date_parse(start_date)
            finish_date = self.time_series_ops.date_parse(finish_date)

        if finish_date < start_date:
            logger.error("Download finish date is before start data!")

            return

        now = pd.Timestamp(datetime.datetime.utcnow(), tz='utc')

        # Do not allow downloading of future data!
        if finish_date > now:
            finish_date = now

        df_dict = {}

        # Loop through each ticker
        for ticker in tickers:

            has_old = False

            if delete_cached_files and write_to_disk_db:
                logger.info("Deleting all cached temp files for " + ticker)

                for name in glob.glob(self.temp_data_folder + '/*' + ticker +
                                      "*"):
                    try:
                        os.remove(name)
                    except:
                        logger.warn("Couldn't delete file " + name)

                logger.info("Finished deleting cached files for " + ticker)

            # If we have been asked to append data, load up what you can from the internal database
            # find the last point
            if append_data and if_exists_ticker == 'append' and write_to_disk_db:
                logger.info("Trying to download old data first for " + ticker)

                try:
                    df_old = data_source_local.fetch_market_data(
                        start_date,
                        finish_date,
                        ticker,
                        web_proxies=web_proxies)

                    # This will vary between tickers (in particular if we happen to add a new ticker)
                    start_date = df_old.index[-1]

                    has_old = True

                    # Remove reference - big file!
                    df_old = None

                except Exception as e:
                    logger.info("No data found for ticker " + ticker +
                                " with error: " + str(e))
            else:
                logger.info("Downloading new data for " + ticker + ".")

            # Date range may not work with timezones
            start_date = pd.Timestamp(start_date.replace(tzinfo=None))
            finish_date = pd.Timestamp(finish_date.replace(tzinfo=None))

            if finish_date - start_date < pd.Timedelta(days=1):
                start_date_list = [start_date, finish_date]
            else:
                # download from that last point to the present day
                start_date_list = pd.date_range(start_date, finish_date)

                start_date_list = [
                    pd.Timestamp(x.to_pydatetime()) for x in start_date_list
                ]

                if finish_date > start_date_list[-1]:
                    start_date_list.append(finish_date)

            df = None
            filename = os.path.join(self.temp_data_folder,
                                    ticker) + '.' + fileformat

            try:
                # df = UtilFunc().read_dataframe_from_hdf(filename)
                pass
            except:
                logger.info("Couldn't read HDF5/Parquet file for " + ticker)

            # Create downloads in x minute chunks (if we request very large chunks of data with certain data providers,
            # we could cause problems!)
            if df is None:
                df_remote_list = []

                # Loop by day (otherwise can end up with too many open files!)
                for i in range(0, len(start_date_list) - 1):

                    if chunk_size_str is not None:
                        if start_date_list[
                                i + 1] - start_date_list[i] < pd.Timedelta(
                                    minutes=chunk_int_min):
                            start_date_hist = [start_date_list[i]]
                            finish_date_hist = [start_date_list[i + 1]]
                        else:
                            start_date_hist, finish_date_hist = UtilFunc(
                            ).split_into_freq(start_date_list[i],
                                              start_date_list[i + 1],
                                              freq=chunk_size_str)
                    else:
                        start_date_hist = [start_date_list[i]]
                        finish_date_hist = [start_date_list[i + 1]]

                    # For FX and most other markets we should remove weekends (cryptocurrencies do have weekend data)
                    if self._remove_weekend_points():
                        start_date_hist, finish_date_hist = UtilFunc(
                        ).remove_weekend_points(start_date_hist,
                                                finish_date_hist)

                    output = []

                    if constants.use_multithreading:

                        # Create a multiprocess object for downloading data
                        swim = Swim(parallel_library=constants.
                                    database_populator_threading_library)
                        pool = swim.create_pool(thread_no=self._get_threads())

                        result = []

                        for i in range(0, len(start_date_hist)):
                            # output.append(self._fetch_market_data(start_date_hist[i], finish_date_hist[i], ticker))

                            result.append(
                                pool.apply_async(
                                    self._fetch_market_data,
                                    args=(start_date_hist[i],
                                          finish_date_hist[i], ticker,
                                          write_temp_to_disk,
                                          read_cached_from_disk, web_proxies)))

                        output = [p.get() for p in result]

                        swim.close_pool(pool, True)
                    else:
                        # Otherwise run in single threaded fashion
                        for i in range(0, len(start_date_hist)):
                            output.append(
                                self._fetch_market_data(
                                    start_date_hist[i],
                                    finish_date_hist[i],
                                    ticker,
                                    write_to_disk=write_temp_to_disk,
                                    read_cached_from_disk=read_cached_from_disk,
                                    web_proxies=web_proxies))

                    # Get all the dataframe chunks and returned messages
                    df_list = [
                        self._remove_duplicates_time_series(x,
                                                            remove_duplicates,
                                                            field='mid')
                        for x, y in output if x is not None
                    ]
                    msg_list = [
                        y for x, y in output if x is not None and y is not None
                    ]

                    # Concatenate all the 5 (or larger) minute data chunks
                    try:
                        if df_list != []:
                            df_temp = pd.concat(df_list)

                            if df_temp is not None:
                                if not (df_temp.empty):
                                    df_remote_list.append(df_temp)

                    except Exception as e:
                        logger.error(str(e))

                if df_remote_list != []:
                    df = pd.concat(df_remote_list)

                    # Need to sort data (database assumes sorted data for chunking/searches)
                    df = df.sort_index()
                    df = self.time_series_ops.localize_as_UTC(df)

                    if write_large_hdf5_parquet:
                        if df is not None:
                            if not (df.empty):
                                key =  '_' + self._get_postfix() + "_" + \
                                       (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')
                                filename = os.path.join(
                                    csv_folder,
                                    ticker + key) + '.' + fileformat

                                # Temporary cache for testing purposes (also if the process crashes, we can read this back in)
                                UtilFunc().write_dataframe_to_binary(
                                    df, filename, format=binary_format)

            if df is not None:
                # Assume UTC time (don't want to mix UTC and non-UTC in database!)
                df = self.time_series_ops.localize_as_UTC(df)

            # write CSV
            if write_large_csv:
                if df is not None:
                    if not (df.empty):
                        key = '_' + self._get_postfix() + "_" + \
                              (str(df.index[0]) + str(df.index[-1])).replace(":", '_').replace(" ", '_')

                        if csv_compression == 'gzip':
                            df.to_csv(os.path.join(csv_folder,
                                                   ticker + key + ".csv.gz"),
                                      compression='gzip')
                        else:
                            df.to_csv(
                                os.path.join(csv_folder,
                                             ticker + key + ".csv"))

            if return_df:
                df_dict[ticker] = df

            # Dump what we have locally (or whatever DatabaseSource we have defined)
            try:

                start_date = start_date.replace(tzinfo=pytz.utc)

                # Remove first point if matches last point from dataset
                if has_old:
                    if df.index[0] == start_date:
                        df = df[1:]

                if df is not None:
                    df = df.sort_index()

                    df = self._remove_duplicates_time_series(df,
                                                             remove_duplicates,
                                                             field='mid')

                if write_to_disk_db and df is not None:
                    data_source_local.append_market_data(
                        df,
                        ticker,
                        if_exists_table=if_exists_table,
                        if_exists_ticker=if_exists_ticker)

                    logger.info("Wrote to database for " + ticker)

            except Exception as e:
                final_err = "Data was missing for these dates " + str(start_date) + " - " + str(finish_date) + " for " \
                            + str(tickers) + " Didn't write anything to disk or return any valid dataframe: " + str(e)

                logger.error(final_err)

            if df is None:
                msg_list.append("No downloaded data for " + str(start_date) +
                                " - " + str(finish_date) +
                                ". Is this a holiday?")

        # Returns a status containing any failed downloads, which can be read by a user
        return msg_list, df_dict

    def _remove_duplicates_time_series(self,
                                       df,
                                       remove_duplicates,
                                       field='mid'):

        if remove_duplicates:
            df = self.time_series_ops.drop_consecutive_duplicates(df, field)

        return df

    def combine_mini_df_from_disk(self, tickers=None, remove_duplicates=True):
        """Combines the mini HDF5/Parquet files for eg. 5 min chunks and combine into a very large HDF5/Parquet file, which is likely to be
        for multiple months of data. Uses multithreading to speed up, by using a thread for each different ticker.

        Parameters
        ----------
        tickers : str (list or dict)
            List of tickers

        remove_duplicates : bool
            Remove duplicated market prices, which follow one another

        Returns
        -------

        """

        if tickers is None: tickers = self.tickers.keys()

        if isinstance(tickers, dict): tickers = tickers.keys()

        if not (isinstance(tickers, list)):
            tickers = [tickers]

        if constants.use_multithreading:
            swim = Swim(parallel_library=constants.
                        database_populator_threading_library)
            pool = swim.create_pool(thread_no=self._get_threads())

            result = []

            for i in range(0, len(tickers)):
                result.append(
                    pool.apply_async(
                        self._combine_mini_df_from_disk_single_thread,
                        args=(
                            tickers[i],
                            remove_duplicates,
                        )))

            output = [p.get() for p in result]

            swim.close_pool(pool, True)

        else:
            for i in range(0, len(tickers)):
                self._combine_mini_df_from_disk_single_thread(
                    tickers[i], remove_duplicates)

    def _combine_mini_df_from_disk_single_thread(self,
                                                 ticker,
                                                 remove_duplicates=True):

        logger = LoggerManager.getLogger(__name__)
        time_series_ops = TimeSeriesOps()

        logger.info('Getting ' + ticker + ' filenames...')
        temp_data_folder = self.temp_data_folder

        filename_list = []

        for root, dirnames, filenames in os.walk(temp_data_folder):

            for filename in filenames:
                if ticker in filename and '.' + fileformat in filename:
                    filename_h5_parquet = os.path.join(root, filename)

                    # if filename is less than 10MB add (otherwise likely a very large aggregated file!)
                    if os.path.getsize(filename_h5_parquet) < 10 * 1024 * 1024:
                        filename_list.append(filename_h5_parquet)

        df_list = []

        util_func = UtilFunc()

        logger.info('Loading ' + ticker + ' mini dataframes into memory')

        i = 0

        if len(filename_list) == 0:
            logger.warn("Looks like there are no files for " + ticker +
                        " in " + temp_data_folder +
                        ". Are you sure path is correct?")

        # Go through each mini file which represents a few minutes of data and append it
        for filename in filename_list:
            filesize = 0

            try:
                filesize = os.path.getsize(filename) / 1024.0
                df = util_func.read_dataframe_from_binary(filename,
                                                          format=binary_format)

                i = i + 1

                # Every 100 files, print reading progress
                if i % 100 == 0:
                    logger.info('Reading ' + filename + ' number ' + str(i))

                if df is not None:
                    df = df.sort_index()
                    df = self._remove_duplicates_time_series(df,
                                                             remove_duplicates,
                                                             field='mid')

                    df_list.append(df)
            except Exception as e:
                logger.warn('Failed to parse ' + filename + " of " +
                            str(filesize) + "KB")  # + str(e))

            # if i > 1000:
            #    break

        # Assume UTC time (don't want to mix UTC and non-UTC in database!)
        if df_list == []:
            logger.warn('No dataframe read for ' + ticker +
                        ', cannot combine!')

            return

        logger.info('About to combine ' + ticker +
                    ' into large dataframe to write to disk...')

        df = pd.concat(df_list)
        df = time_series_ops.localize_as_UTC(df)

        df = df.sort_index()

        df = self._remove_duplicates_time_series(df,
                                                 remove_duplicates,
                                                 field='mid')

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder,
                                ticker + postfix) + '.' + fileformat

        df = time_series_ops.localize_as_UTC(df)
        util_func.write_dataframe_to_binary(df, filename, format=binary_format)

    def write_df_to_db(self,
                       tickers=None,
                       remove_duplicates=True,
                       if_exists_table='append',
                       if_exists_ticker='replace'):
        """Loads up a large HDF5/Parquet file from disk into a pd DataFrame and then dumps locally.
        Uses multithreading to speed it up, by using a thread for each different ticker.

        Parameters
        ----------
        tickers : str (list or dict)
            List of tickers

        remove_duplicates : bool
            True (default) - removes any follow on duplicates in the dataset

        if_exists_table : str
            'append' - if database table already exists append data to it
            'replace' - remove existing database table

        if_exists_ticker : str
            'append' - if ticker already exists in the database, append to it
            'replace' - replace any data for this ticker

        Returns
        -------

        """

        if tickers is None: tickers = self.tickers.keys()

        if isinstance(tickers, dict): tickers = tickers.keys()

        if not (isinstance(tickers, list)):
            tickers = [tickers]

        if constants.use_multithreading:

            swim = Swim(parallel_library=constants.
                        database_populator_threading_library)
            pool = swim.create_pool(thread_no=self._get_threads())

            result = []

            for i in range(0, len(tickers)):
                result.append(
                    pool.apply_async(self._write_df_to_db_single_thread,
                                     args=(
                                         tickers[i],
                                         remove_duplicates,
                                         if_exists_table,
                                         if_exists_ticker,
                                     )))

            output = [p.get() for p in result]

            swim.close_pool(pool, True)
        else:
            for i in range(0, len(tickers)):
                self._write_df_to_db_single_thread(tickers[i],
                                                   remove_duplicates,
                                                   if_exists_table,
                                                   if_exists_ticker)

    def _write_df_to_db_single_thread(self,
                                      ticker,
                                      remove_duplicates=True,
                                      if_exists_table='append',
                                      if_exists_ticker='replace'):

        logger = LoggerManager.getLogger(__name__)

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder,
                                ticker + postfix) + '.' + fileformat

        logger.info("Reading " + filename)

        util_func = UtilFunc()
        time_series_ops = TimeSeriesOps()
        data_source_local = self._get_output_data_source()

        df = util_func.read_dataframe_from_binary(filename,
                                                  format=binary_format)

        if df is not None:
            df = time_series_ops.localize_as_UTC(df)

            data_source_local.append_market_data(
                df,
                ticker,
                if_exists_table=if_exists_table,
                if_exists_ticker=if_exists_ticker)
        else:
            logger.warn("Couldn't write dataframe for " + ticker +
                        " to database, appears it is empty!")

    def _remove_saturday(self):
        return True
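
# A minimal usage sketch (not part of the original listing), assuming a concrete subclass such as
# DatabasePopulatorDukascopy which implements the abstract methods above. The import path, ticker
# and dates below are illustrative assumptions.
if __name__ == '__main__':
    from tcapy.data.databasepopulator import DatabasePopulatorDukascopy

    populator = DatabasePopulatorDukascopy()

    # Download a month of tick data in chunks, returning the DataFrames as well as writing files
    msg, df_dict = populator.download_to_csv(
        '01 Jan 2020', '01 Feb 2020', ['EURUSD'], return_df=True)

    # msg contains any failed downloads; df_dict maps each ticker to its tick data DataFrame
    for ticker, df in df_dict.items():
        print(ticker, df.head())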
Example #2
class DataTestCreator(object):
    """This class copies market data/trade data to our database (by default: Arctic/MongoDB for market data and
    MSSQL for trade data). It generates randomised test trades/orders based upon the market data, randomly perturbing
    the bid/ask to simulate a traded price.

    """
    def __init__(self,
                 market_data_postfix='dukascopy',
                 csv_market_data=None,
                 write_to_db=True,
                 market_data_source='arctic',
                 sql_trade_database_type='mysql',
                 market_data_database_name=constants.arctic_market_data_database_name,
                 market_data_database_table=constants.arctic_market_data_database_table,
                 trade_data_database_name=constants.mysql_trade_data_database_name):
        if csv_market_data is None:
            if market_data_source in ['arctic', 'kdb', 'influxdb']:
                self._market_data_source = market_data_source + '-' + market_data_postfix
            else:
                self._market_data_source = market_data_postfix
        else:
            self._market_data_source = csv_market_data

        self._tca_market = Mediator.get_tca_market_trade_loader()

        # Assumes a tick database (eg. Arctic/MongoDB) for market data and a SQL database for trade/order data
        if write_to_db:

            if market_data_source == 'arctic':
                self._database_source_market = DatabaseSourceArctic(
                    postfix=market_data_postfix)  # market data source
            elif market_data_source == 'kdb':
                self._database_source_market = DatabaseSourceKDB(
                    postfix=market_data_postfix)  # market data source
            elif market_data_source == 'influxdb':
                self._database_source_market = DatabaseSourceInfluxDB(
                    postfix=market_data_postfix)  # market data source

            self._market_data_database_name = market_data_database_name
            self._market_data_database_table = market_data_database_table

            if sql_trade_database_type == 'ms_sql_server':
                self._database_source_trade = DatabaseSourceMSSQLServer(
                )  # trade data source
                self._trade_data_database_name = trade_data_database_name
            elif sql_trade_database_type == 'mysql':
                self._database_source_trade = DatabaseSourceMySQL(
                )  # trade data source
                self._trade_data_database_name = trade_data_database_name
            elif sql_trade_database_type == 'sqlite':
                self._database_source_trade = DatabaseSourceSQLite(
                )  # trade data source
                self._trade_data_database_name = trade_data_database_name

        self.time_series_ops = TimeSeriesOps()
        self.rand_time_series = RandomiseTimeSeries()

    def populate_test_database_with_csv(self,
                                        csv_market_data=None,
                                        ticker=None,
                                        csv_trade_data=None,
                                        if_exists_market_table='append',
                                        if_exists_market_ticker='replace',
                                        if_exists_trade_table='replace',
                                        market_data_postfix='dukascopy',
                                        remove_market_duplicates=False):
        """Populates both the market database and trade database with market data and trade/order data respectively, which
        have been sourced in CSV/HDF5 files.

        Parameters
        ----------
        csv_market_data : str (list)
            Path of CSV/HDF5 file with market data

        ticker : str (list)
            Ticker for market data

        csv_trade_data : dict
            Dictionary with name of trade/order and associated path of CSV/HDF5 file with trade/order data

        if_exists_market_table : str
            'replace' - deletes whole market data table
            'append' (default) - adds to existing market data

        if_exists_market_ticker : str
            'replace' (default) - deletes existing data for the ticker
            'append' - appends data for this ticker

        if_exists_trade_table : str
            'replace' - deletes data in trade table, before writing

        market_data_postfix : str (default 'dukascopy')
            data source for market data (typically broker or venue name)

        remove_market_duplicates : bool (default: False)
            Should we remove any duplicated values in the market data? For TCA purposes, we can usually remove duplicated
            values. However, we need to be careful when using richer market data (eg. with volume data), where consecutive
            prices might be the same but have different volume/other fields

        Returns
        -------

        """

        logger = LoggerManager.getLogger(__name__)

        # Populate the market data (eg. spot data)
        if csv_market_data is not None:
            self._database_source_market.set_postfix(market_data_postfix)
            logger.info('Writing market data to database')
            self._database_source_market.convert_csv_to_table(
                csv_market_data,
                ticker,
                self._market_data_database_table,
                database_name=self._market_data_database_name,
                if_exists_table=if_exists_market_table,
                if_exists_ticker=if_exists_market_ticker,
                remove_duplicates=remove_market_duplicates)

        # Populate the test trade/order data (which will have been randomly generated)
        if csv_trade_data is not None:
            logger.info('Writing trade data to database')

            # Allow for writing of trades + orders each to a different database table
            if isinstance(csv_trade_data, dict):
                for key in csv_trade_data.keys():
                    # csv file name, trade/order name is key (eg. trade_df)
                    self._database_source_trade.convert_csv_to_table(
                        csv_trade_data[key],
                        None,
                        key,
                        database_name=self._trade_data_database_name,
                        if_exists_table=if_exists_trade_table)

            # Otherwise simply assume we are writing trade data
            else:
                logger.error("Specify trade/orders hierarchy")

        logger.info('Completed writing data to database')

    def create_test_trade_order(self,
                                ticker,
                                start_date='01 Jan 2016',
                                finish_date='01 May 2018',
                                order_min_size=0.5 * constants.MILLION,
                                order_max_size=20.0 * constants.MILLION,
                                number_of_orders_min_per_year=252 * 20,
                                number_of_orders_max_per_year=252 * 200):
        """Create a randomised list of orders & trade using indicative market data as a source (and perturbing the
        execution prices, within various constraints, such as the approximate size of orders trades, the orders per _year

        Parameters
        ----------
        ticker : str
            Ticker

        start_date : str
            Start date of the orders

        finish_date : str
            Finish date of the orders

        order_min_size : float
            Minimum size of orders

        order_max_size : float
            Maximum size of orders

        number_of_orders_min_per_year : int
            Minimum orders per year

        number_of_orders_max_per_year : int
            Maximum orders per year

        Returns
        -------
        DataFrame
        """
        logger = LoggerManager.getLogger(__name__)

        if isinstance(ticker, str):
            ticker = [ticker]

        order_list = []
        trade_list = []

        start_date = self.time_series_ops.date_parse(start_date,
                                                     assume_utc=True)
        finish_date = self.time_series_ops.date_parse(finish_date,
                                                      assume_utc=True)
        util_func = UtilFunc()

        # Make this parallel? but may have memory issues
        for tick in ticker:

            logger.info("Loading market data for " + tick)

            # split into yearly chunks (otherwise can run out of memory easily)
            date_list = util_func.split_date_single_list(
                start_date,
                finish_date,
                split_size='yearly',
                add_partial_period_start_finish_dates=True)

            # TODO do in a batch fashion
            for i in range(0, len(date_list) - 1):
                df = self._tca_market.get_market_data(
                    MarketRequest(start_date=date_list[i],
                                  finish_date=date_list[i + 1],
                                  ticker=tick,
                                  data_store=self._market_data_source))

                # self.database_source_market.fetch_market_data(start_date = start_date, finish_date = finish_date, ticker = tick)

                # Need to make sure there's sufficient market data!
                if df is not None:
                    if len(df.index) >= 2:
                        # Get the percentage of the year represented by the difference between the start and finish dates
                        year_perc = float(
                            (df.index[-1] - df.index[0]).total_seconds() /
                            (24.0 * 60.0 * 60.0)) / 365.0

                        logger.info("Constructing randomised trades for " +
                                    tick)

                        number_of_orders_min = int(
                            year_perc * number_of_orders_min_per_year)
                        number_of_orders_max = int(
                            year_perc * number_of_orders_max_per_year)

                        # Split up the data frame into equally sized chunks
                        df_orders = self._derive_order_no(
                            self._strip_columns(df, tick),
                            number_of_orders_min, number_of_orders_max)

                        # Don't want a memory leak, so delete this as soon possible from memory!
                        del df

                        # order_counter = 0

                        logger.info("Now beginning order construction for " +
                                    tick)

                        # For each order create randomised associated trades
                        # group together all the trades per day as orders
                        for df_order in df_orders:

                            # Set duration of the grandparent order (find randomised start/finish time)
                            # somewhere between 0-25% for start, and 75% to 100% for end point
                            df_order = self.rand_time_series.randomly_truncate_data_frame_within_bounds(
                                df_order, start_perc=0.25, finish_perc=0.75)

                            logger.debug("Creating order between " +
                                         str(df_order.index[0]) + " - " +
                                         str(df_order.index[-1]))

                            # Assume all orders/trades are in the same direction (which is randomly chosen)
                            buy_sell = randint(0, 1)

                            # Sell trades
                            if buy_sell == 0:
                                side_no = -1
                                side = 'bid'

                            # Buy trades
                            else:
                                side_no = 1
                                side = 'ask'

                            magnitude = 10000.0 * 2

                            if tick == 'USDJPY': magnitude = 100.0 * 2.0

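                            # Occasionally (~3% of the time) flip the ticker to its inverse pair (eg. EURUSD -> USDEUR)
                            # and invert the bid/ask/mid prices accordingly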
                            if randint(0, 100) > 97:
                                new_tick = tick[3:6] + tick[0:3]

                                if 'ticker' in df_order.columns:
                                    df_order['ticker'] = new_tick

                                if 'bid' in df_order.columns and 'ask' in df_order.columns:
                                    ask = 1.0 / df_order['bid']
                                    bid = 1.0 / df_order['ask']

                                    df_order['bid'] = bid
                                    df_order['ask'] = ask

                                df_order['mid'] = 1.0 / df_order['mid']
                            else:
                                new_tick = tick

                            # Get 'bid' for sells, and 'ask' for buys
                            df_order['trade_value'] = df_order[side]

                            # We want to simulate the executions by perturbing the buys randomly
                            df_order = self.rand_time_series.randomly_perturb_column(
                                df_order,
                                column='trade_value',
                                magnitude=magnitude)

                            # Assume notional is in base currency in vast majority of cases
                            if randint(0, 100) > 97:
                                notional_currency = new_tick[3:6]
                            else:
                                notional_currency = new_tick[0:3]

                            notional_multiplier = 1.0

                            if notional_currency == 'JPY':
                                notional_multiplier = 100.0

                            # Randomly choose a realistic order notional
                            # This will later be subdivided into trade notional
                            order_notional = randint(
                                order_min_size * notional_multiplier,
                                order_max_size * notional_multiplier)

                            order_additional_attributes = {
                                'broker_id':
                                constants.test_brokers_dictionary['All'],
                                'broker_sub_id':
                                constants.test_sub_brokers_dictionary['All'],
                                'algo_id':
                                constants.test_algos_dictionary['All'],
                                'algo_settings':
                                'default',
                            }

                            # Construct an order and add it to list
                            ind_order = self._construct_order(
                                df_order,
                                order_type='order',
                                notional=order_notional,
                                notional_currency=notional_currency,
                                side=side_no,
                                tick=new_tick,
                                additional_attributes=
                                order_additional_attributes)

                            order_list.append(ind_order)

                            trade_additional_attributes = self.grab_attributes_from_trade_order(
                                ind_order, [
                                    'broker_id', 'broker_sub_id', 'algo_id',
                                    'algo_settings'
                                ])

                            # Now create all the broker messages for the order

                            # These will consist firstly of placement messages
                            # then potentially cancels, cancel/replaces and in most cases randomly assigned trade fills
                            trade_list = self._create_trades_from_order(
                                trade_list=trade_list,
                                df_order=df_order,
                                tick=new_tick,
                                ind_order=ind_order,
                                side_no=side_no,
                                order_notional=order_notional,
                                notional_currency=notional_currency,
                                additional_attributes=
                                trade_additional_attributes)

                            # order_counter = order_counter + 1

        # Aggregate all the lists into DataFrames (setting 'date' as the index)

        # For the trade dataframe, also drop the 'index' column which was previously used to ensure that fills were after placements
        trade_order_dict = {
            'trade_df':
            self.time_series_ops.aggregate_dict_to_dataframe(
                trade_list, 'date', 'index'),
            'order_df':
            self.time_series_ops.aggregate_dict_to_dataframe(
                order_list, 'date')
        }

        return trade_order_dict

    def _create_trades_from_order(self,
                                  trade_list=None,
                                  df_order=None,
                                  tick=None,
                                  ind_order=None,
                                  side_no=None,
                                  order_notional=None,
                                  notional_currency=None,
                                  additional_attributes=None):

        trade_notional = order_notional

        # Assume placement at start of order (a placement will have the order notional)
        placement_event = self.construct_trade(
            df_order,
            order_notional=order_notional,
            execution_venue=constants.test_venues_dictionary['All'],
            order=ind_order,
            side=side_no,
            tick=tick,
            event_type='placement',
            notional_currency=notional_currency,
            additional_attributes=additional_attributes)

        trade_list.append(placement_event)

        # Randomly choose an event (cancel/replace + fill, cancel or fill)
        i = randint(0, 1000)

        # Very rare event, same timestamp for a trade, same size too (but different ID)
        if i < 1:

            # executed trade
            fill_event = self.construct_trade(
                df_order,
                order=ind_order,
                order_notional=order_notional,
                execution_venue=placement_event['venue'],
                notional_currency=notional_currency,
                executed_notional=int(float(trade_notional) * 0.5),
                side=side_no,
                tick=tick,
                event_type='trade',
                index=min(len(df_order.index), 5),
                additional_attributes=additional_attributes)

            trade_list.append(fill_event)

            fill_event = self.construct_trade(
                df_order.copy(),
                order=ind_order,
                order_notional=order_notional,
                execution_venue=placement_event['venue'],
                notional_currency=notional_currency,
                executed_notional=int(float(trade_notional) * 0.5),
                side=side_no,
                tick=tick,
                event_type='trade',
                index=min(len(df_order.index), 5),
                additional_attributes=additional_attributes)

            trade_list.append(fill_event)
        elif i < 50:
            # Cancel/replace event
            cancel_replace_index = randint(1, min(len(df_order.index), 20))

            cancel_replace_event = self.construct_trade(
                df_order,
                order=ind_order,
                execution_venue=placement_event['venue'],
                notional_currency=notional_currency,
                side=side_no,
                tick=tick,
                event_type='cancel/replace',
                index=cancel_replace_index,
                additional_attributes=additional_attributes)

            trade_list.append(cancel_replace_event)

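            # The fill must come after the cancel/replace, so pick a later index within the order window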
            fill_event_index = randint(cancel_replace_index + 1,
                                       min(len(df_order.index), 50))

            # Executed fill event
            fill_event = self.construct_trade(
                df_order,
                order=ind_order,
                order_notional=order_notional,
                execution_venue=placement_event['venue'],
                executed_notional=trade_notional,
                notional_currency=notional_currency,
                side=side_no,
                tick=tick,
                event_type='trade',
                index=fill_event_index,
                additional_attributes=additional_attributes)

            trade_list.append(fill_event)

        # Rare event, full cancellation of order
        elif i < 60:
            cancel_index = randint(1, min(len(df_order.index), 20))

            cancel_event = self.construct_trade(
                df_order,
                order=ind_order,
                execution_venue=placement_event['venue'],
                executed_notional=0,
                notional_currency=notional_currency,
                side=side_no,
                tick=tick,
                event_type='cancel',
                index=cancel_index,
                additional_attributes=additional_attributes)

            trade_list.append(cancel_event)

        elif i < 80:
            # Where we have two trade fills for a single child order of different sizes
            perc = float(randint(5, 95)) / 100.0

            # executed trade
            fill_event = self.construct_trade(
                df_order,
                order=ind_order,
                execution_venue=placement_event['venue'],
                notional_currency=notional_currency,
                executed_notional=int(float(trade_notional) * perc),
                side=side_no,
                tick=tick,
                event_type='trade',
                index=randint(1, min(len(df_order.index), 50)),
                additional_attributes=additional_attributes)

            trade_list.append(fill_event)

            fill_event = self.construct_trade(
                df_order,
                order=ind_order,
                execution_venue=placement_event['venue'],
                notional_currency=notional_currency,
                executed_notional=int(float(trade_notional) * (1.0 - perc)),
                side=side_no,
                tick=tick,
                event_type='trade',
                index=randint(fill_event['index'], min(len(df_order.index),
                                                       100)),
                additional_attributes=additional_attributes)

            trade_list.append(fill_event)

        # Most common event, single trade/fill
        else:
            # Executed trade
            fill_event = self.construct_trade(
                df_order,
                order=ind_order,
                order_notional=order_notional,
                execution_venue=placement_event['venue'],
                notional_currency=notional_currency,
                executed_notional=trade_notional,
                side=side_no,
                tick=tick,
                event_type='trade',
                index=randint(1, min(len(df_order.index), 50)),
                additional_attributes=additional_attributes)

            trade_list.append(fill_event)

        return trade_list

    def _derive_order_no(self, df, orders_min, orders_max):
        df_chunks_list = self.time_series_ops.split_array_chunks(
            df, chunks=randint(orders_min, orders_max))

        if isinstance(df_chunks_list, pd.DataFrame):
            return [df_chunks_list]

        return df_chunks_list

    def _create_unique_trade_id(self, order_type, ticker, datetime_input):
        return order_type + "_" + ticker + str(datetime_input) + "_" + str(
            datetime.datetime.utcnow()) + '_' + str(randint(0, 100000))

    def _construct_order(self,
                         df,
                         order_type=None,
                         notional=None,
                         notional_currency=None,
                         side=None,
                         tick=None,
                         additional_attributes=None,
                         **kwargs):

        order = {}

        # For internal purposes
        order['ticker'] = tick
        order['notional'] = notional

        order['notional_currency'] = notional_currency

        order['side'] = side

        order['date'] = df.index[0]
        order['benchmark_date_start'] = df.index[0]
        order['benchmark_date_end'] = df.index[-1]

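        # Use the first mid of the order window as both the limit price and the arrival price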
        order['price_limit'] = df['mid'][0]
        order['arrival_price'] = df['mid'][0]

        order['portfolio_id'] = self.add_random_sample(
            constants.test_portfolios_dictionary['All'])
        order['portfolio_manager_id'] = self.add_random_sample(
            constants.test_portfolio_managers_dictionary['All'])
        order['trader_id'] = self.add_random_sample(
            constants.test_traders_dictionary['All'])
        order['account_id'] = self.add_random_sample(
            constants.test_accounts_dictionary['All'])

        order['id'] = self._create_unique_trade_id(order_type, tick,
                                                   order['date'])

        kwargs['order'] = order

        order = self.additional_order_processing(**kwargs)

        # Add additional randomized attributes
        if additional_attributes is not None:

            # Merge list of additional attributes
            if isinstance(additional_attributes, list):
                result_dict = {}
                for d in additional_attributes:
                    result_dict.update(d)

                additional_attributes = result_dict

            for k in additional_attributes.keys():

                additional = additional_attributes[k]

                if isinstance(additional, list):
                    additional = self.add_random_sample(additional)

                order[k] = additional

        return order

    def additional_order_processing(self, **kwargs):

        return kwargs['order']

    def construct_trade(self,
                        df,
                        order_notional=None,
                        executed_notional=None,
                        notional_currency=None,
                        execution_venue=None,
                        side=None,
                        order=None,
                        tick=None,
                        event_type=None,
                        additional_attributes=None,
                        index=0):

        trade = {}

        if order_notional is None:
            order_notional = 0

        trade['order_notional'] = order_notional
        trade['notional_currency'] = notional_currency
        trade['ticker'] = tick
        trade['side'] = side
        trade['index'] = index
        trade['date'] = df.index[index]
        trade['market_bid'] = df['bid'][index]
        trade['market_ask'] = df['ask'][index]
        trade['market_mid'] = df['mid'][index]
        trade['price_limit'] = df['mid'][index]

        trade['event_type'] = event_type

        trade['executed_price'] = 0
        trade['venue'] = execution_venue
        trade['executed_notional'] = 0

        if event_type == 'trade':
            trade['executed_notional'] = executed_notional

            # Sanity check: executed_notional should be numeric here (NaN would indicate a problem upstream)
            try:
                if np.isnan(trade['executed_notional']):
                    pass

            except:
                pass

            trade['executed_price'] = df['trade_value'][index]

        trade['venue'] = self.add_random_sample(
            constants.test_venues_dictionary['All'])

        if order is not None:
            trade[constants.order_name + '_pointer_id'] = order['id']
            trade['price_limit'] = order['price_limit']

            trade['portfolio_id'] = order['portfolio_id']
            trade['portfolio_manager_id'] = order['portfolio_manager_id']
            trade['trader_id'] = order['trader_id']
            trade['account_id'] = order['account_id']

        trade['id'] = self._create_unique_trade_id('execution', tick,
                                                   trade['date'])

        if additional_attributes is not None:
            for k in additional_attributes.keys():
                trade[k] = additional_attributes[k]

        return trade

    def add_random_sample(self, lst):

        return lst[randint(0, len(lst) - 1)]

    def grab_attributes_from_trade_order(self, trade_order, attributes):

        attr_dict = {}

        for a in attributes:
            attr_dict[a] = trade_order[a]

        return attr_dict

    def _strip_columns(self, df, tick):

        # Filter the market data so it only includes the specific asset (data from Arctic won't have a 'ticker' column)
        # and only during "main" FX hours, excluding any Saturday data
        if 'ticker' in df.columns:
            df = df[(df.index.hour >= 6) & (df.index.hour < 21) &
                    (df.index.dayofweek != 5) & (df['ticker'] == tick)]
        else:
            df = df[(df.index.hour >= 6) & (df.index.hour < 21) &
                    (df.index.dayofweek != 5)]

        keep_cols = ['bid', 'ask', 'mid']

        remove_cols = []

        for k in df.columns:
            if k not in keep_cols:
                remove_cols.append(k)

        if remove_cols != []:
            df.drop(remove_cols, inplace=True, axis=1)

        # Ensure that the market data is properly sorted
        df.sort_index(inplace=True)

        # Calculate mid price (if it doesn't exist)
        if 'mid' not in df.columns:
            df['mid'] = (df['bid'].values + df['ask'].values) / 2.0

        # Create synthetic bid/ask if they don't exist
        if 'bid' not in df.columns:
            df['bid'] = 0.9995 * df['mid'].values

        if 'ask' not in df.columns:
            df['ask'] = 1.0005 * df['mid'].values

        # Out-of-hours times (before 6am and after 9pm GMT) and Saturdays have already been stripped above
        return df
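
# A minimal usage sketch (not part of the original listing): generate randomised test trades/orders,
# assuming EURUSD market data is already stored in the market database with a 'dukascopy' postfix.
# The import path and dates below are illustrative assumptions.
if __name__ == '__main__':
    from tcapy.data.datatestcreator import DataTestCreator

    data_test_creator = DataTestCreator(market_data_postfix='dukascopy', write_to_db=False)

    # Create randomised orders and their associated trade fills from indicative market data
    trade_order = data_test_creator.create_test_trade_order(
        ['EURUSD'], start_date='01 Jan 2017', finish_date='01 Feb 2017')

    # trade_order is a dict of DataFrames keyed by 'trade_df' and 'order_df'
    print(trade_order['trade_df'].head())
    print(trade_order['order_df'].head())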