Example #1
def create_market_trade_data_eikon():
    """Creates a small dataset for testing purposes for market, trade and order data for EURUSD at the start of May 2017,
    which is dumped to the designated tcapy test harness folder.

    Returns
    -------

    """
    # Use a database source such as Arctic (or fetch directly from Dukascopy) for the market data
    tca_market = TCAMarketTradeLoaderImpl()

    util_func = UtilFunc()

    market_df = []

    for tick in ticker:
        market_request = MarketRequest(ticker=tick,
                                       data_store=data_store,
                                       start_date=start_date,
                                       finish_date=finish_date)

        market_df.append(
            tca_market.get_market_data(market_request=market_request))

    # Note: it can be very slow to write these CSV files
    market_df = pd.concat(market_df)
    market_df.to_csv(os.path.join(folder, 'small_test_market_df_eikon.csv.gz'),
                     compression='gzip')

    # Also write to disk in a binary format (easier to load up later)
    util_func.write_dataframe_to_binary(
        market_df, os.path.join(folder, 'small_test_market_df_eikon.gzip'))

    # Create a spot file in reverse order
    market_df.sort_index(ascending=False)\
        .to_csv(os.path.join(folder, 'small_test_market_df_reverse_eikon.csv.gz'), compression='gzip')

    # Also write the reversed DataFrame to disk as a Parquet file (easier to load up later)
    util_func.write_dataframe_to_binary(
        market_df.sort_index(ascending=False),
        os.path.join(folder, 'small_test_market_df_reverse_eikon.parquet'))

    if create_trade_order_data:
        # Use the market data we just downloaded to CSV, and perturb it to generate the trade data
        data_test_creator = DataTestCreator(
            market_data_postfix=postfix,
            csv_market_data=os.path.join(folder,
                                         'small_test_market_df_eikon.csv.gz'),
            write_to_db=False)

        # Create randomised trade/order data
        trade_order = data_test_creator.create_test_trade_order(
            ticker_trades, start_date=start_date, finish_date=finish_date)

        trade_order['trade_df'].to_csv(
            os.path.join(folder, 'small_test_trade_df_eikon.csv'))
        trade_order['order_df'].to_csv(
            os.path.join(folder, 'small_test_order_df_eikon.csv'))
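The function above relies on several module-level names (ticker, ticker_trades, data_store, start_date, finish_date, folder, postfix and create_trade_order_data) that the original script defines elsewhere. Below is a minimal sketch of how they might be set up before calling it; the concrete values (data store, dates, dump folder) are assumptions rather than the settings used by the original test harness.

import os
import pandas as pd

from tcapy.util.utilfunc import UtilFunc
# The original script also imports TCAMarketTradeLoaderImpl, MarketRequest and
# DataTestCreator from tcapy; their exact module paths are omitted here

# Assumed configuration (placeholders, adjust to your own environment)
ticker = ['EURUSD']                      # tickers to fetch market data for
ticker_trades = ['EURUSD']               # tickers for the randomised trade/order data
data_store = 'arctic-ncfx'               # assumed Arctic market data store
start_date = '01 May 2017'
finish_date = '05 May 2017'
folder = '/home/tcapyuser/csv_dump/'     # assumed dump folder
postfix = 'ncfx'
create_trade_order_data = True

if __name__ == '__main__':
    create_market_trade_data_eikon()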
Example #2
    def _fetch_market_data(self, start, finish, ticker, write_to_disk=True, read_cached_from_disk=True, web_proxies=constants.web_proxies):
        logger = LoggerManager.getLogger(__name__)

        key = (str(start) + str(finish) + ticker + '_' + self._get_postfix()).replace(":", '_')

        filename = os.path.join(self.temp_data_folder, key) + '.' + fileformat
        util_func = UtilFunc()

        start_time_stamp = pd.Timestamp(start)
        finish_time_stamp = pd.Timestamp(finish)

        if self._remove_weekend_points():
            weekend_data = "Weekend? " + key

            weekday_point = UtilFunc().is_weekday_point(start_time_stamp, finish_time_stamp,
                                                        friday_close_nyc_hour=constants.friday_close_utc_hour,
                                                        sunday_open_utc_hour=constants.sunday_open_utc_hour)

            if not(weekday_point):
                return None, weekend_data

        df = None

        if read_cached_from_disk:
            if os.path.exists(filename):
                df = util_func.read_dataframe_from_binary(filename, format=binary_format)

                if df is not None:
                    logger.debug("Read " + filename + " from disk")

        if df is None:
            # Convert tcapy ticker into vendor ticker
            df = self._get_input_data_source().fetch_market_data(start, finish,
                                                                 ticker=self._get_tickers_vendor()[ticker], web_proxies=web_proxies)

            if df is not None:

                if write_to_disk:
                    # Write a small temporary DataFrame to disk (if the process fails later, these can be picked up
                    # without having to call the external vendor again)
                    util_func.write_dataframe_to_binary(df, filename, format=binary_format)

        msg = None

        if df is None:
            msg = "No data? " + key

        return df, msg
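The method returns a pair (df, msg): the market data (or None) plus a diagnostic string ("Weekend? ..." or "No data? ...") explaining why nothing was returned. Below is a minimal standalone sketch of the cache-key/filename convention used above and of how a caller might consume that pair; the folder, postfix and file format are assumed values for illustration.

import os
import pandas as pd

temp_data_folder = '/tmp/tcapy_cache'    # assumed temporary cache folder
fileformat = 'parquet'                   # assumed on-disk format
postfix = 'ncfx'                         # assumed data source postfix

def cache_filename(start, finish, ticker):
    # Same convention as _fetch_market_data: start + finish + ticker + '_' + postfix,
    # with ':' replaced by '_' so the key is a valid filename
    key = (str(start) + str(finish) + ticker + '_' + postfix).replace(":", '_')

    return os.path.join(temp_data_folder, key) + '.' + fileformat

print(cache_filename(pd.Timestamp('2017-05-01 00:00'),
                     pd.Timestamp('2017-05-01 00:05'), 'EURUSD'))

# A caller would typically check the DataFrame and log the message when it is None, e.g.
# df, msg = market_data_loader._fetch_market_data(start, finish, 'EURUSD')
# if df is None:
#     logger.warning(msg)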
Example #3
    def _fetch_market_data(self,
                           start,
                           finish,
                           ticker,
                           write_to_disk=True,
                           read_cached_from_disk=True,
                           web_proxies=constants.web_proxies):
        logger = LoggerManager.getLogger(__name__)

        key = (str(start) + str(finish) + ticker + '_' +
               self._get_postfix()).replace(":", '_')

        filename = os.path.join(self.temp_data_folder, key) + '.' + fileformat
        util_func = UtilFunc()

        start_time_stamp = pd.Timestamp(start)
        finish_time_stamp = pd.Timestamp(finish)

        if self._remove_saturday():
            weekend_data = "Saturday? " + key

            # Ignore Saturday, and don't attempt to download
            if start_time_stamp.dayofweek == 5 or finish_time_stamp.dayofweek == 5:
                return None, weekend_data

        if self._remove_weekend_points():
            weekend_data = "Weekend? " + key

            if start_time_stamp.dayofweek == 6 and start_time_stamp.hour < 20:
                return None, weekend_data

            if start_time_stamp.dayofweek == 4 and start_time_stamp.hour > 22:
                return None, weekend_data

        df = None

        if read_cached_from_disk:
            if os.path.exists(filename):
                df = util_func.read_dataframe_from_binary(filename,
                                                          format=binary_format)

                if df is not None:
                    logger.debug("Read " + filename + " from disk")

        if df is None:
            # Convert tcapy ticker into vendor ticker
            df = self._get_input_data_source().fetch_market_data(
                start,
                finish,
                ticker=self._get_tickers_vendor()[ticker],
                web_proxies=web_proxies)

            if df is not None:
                df = df.drop('ticker', axis=1)

                if write_to_disk:
                    # Write a small temporary DataFrame to disk (if the process fails later, these can be picked up
                    # without having to call the external vendor again)
                    util_func.write_dataframe_to_binary(df,
                                                        filename,
                                                        format=binary_format)

        msg = None

        if df is None:
            msg = "No data? " + key

        return df, msg
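This variant hard-codes the FX weekend filter with Timestamp.dayofweek and hour: Saturday points are dropped entirely, as are Sunday points before 20:00 and Friday points after 22:00. A small standalone check of that logic on plain pandas timestamps; the cut-off hours simply mirror the snippet above.

import pandas as pd

def is_fx_weekend_point(ts):
    """Replicates the dayofweek/hour checks from _fetch_market_data above."""
    ts = pd.Timestamp(ts)

    if ts.dayofweek == 5:                   # Saturday: always skipped
        return True
    if ts.dayofweek == 6 and ts.hour < 20:  # Sunday before the assumed open
        return True
    if ts.dayofweek == 4 and ts.hour > 22:  # Friday after the assumed close
        return True

    return False

for ts in ['2017-05-05 23:30',   # Friday late evening -> skipped
           '2017-05-06 12:00',   # Saturday -> skipped
           '2017-05-07 21:00',   # Sunday evening -> fetched
           '2017-05-08 09:00']:  # Monday -> fetched
    print(ts, is_fx_weekend_point(ts))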
Example #4
    def _combine_mini_df_from_disk_single_thread(self,
                                                 ticker,
                                                 remove_duplicates=True):

        logger = LoggerManager.getLogger(__name__)
        time_series_ops = TimeSeriesOps()

        logger.info('Getting ' + ticker + ' filenames...')
        temp_data_folder = self.temp_data_folder

        filename_list = []

        for root, dirnames, filenames in os.walk(temp_data_folder):

            for filename in filenames:
                if ticker in filename and '.' + fileformat in filename:
                    filename_h5_parquet = os.path.join(root, filename)

                    # If the file is less than 10MB, add it (otherwise it's likely a very large aggregated file!)
                    if os.path.getsize(filename_h5_parquet) < 10 * 1024 * 1024:
                        filename_list.append(filename_h5_parquet)

        df_list = []

        util_func = UtilFunc()

        logger.info('Loading ' + ticker + ' mini dataframes into memory')

        i = 0

        if len(filename_list) == 0:
            logger.warn("Looks like there are no files for " + ticker +
                        " in " + temp_data_folder +
                        ". Are you sure path is correct?")

        # Go through each mini file which represents a few minutes of data and append it
        for filename in filename_list:
            filesize = 0

            try:
                filesize = os.path.getsize(filename) / 1024.0
                df = util_func.read_dataframe_from_binary(filename,
                                                          format=binary_format)

                i = i + 1

                # Every 100 files, log reading progress
                if i % 100 == 0:
                    logger.info('Reading ' + filename + ' number ' + str(i))

                if df is not None:
                    df = df.sort_index()
                    df = self._remove_duplicates_time_series(df,
                                                             remove_duplicates,
                                                             time_series_ops,
                                                             field='mid')

                    df_list.append(df)
            except Exception as e:
                logger.warn('Failed to parse ' + filename + " of " +
                            str(filesize) + "KB")  # + str(e))

            # if i > 1000:
            #    break

        # Assume UTC time (don't want to mix UTC and non-UTC in database!)
        if df_list == []:
            logger.warn('No dataframe read for ' + ticker +
                        ', cannot combine!')

            return

        logger.info('About to combine ' + ticker +
                    ' into large dataframe to write to disk...')

        df = pd.concat(df_list)
        df = time_series_ops.localize_as_UTC(df)

        df = df.sort_index()

        df = self._remove_duplicates_time_series(df,
                                                 remove_duplicates,
                                                 time_series_ops,
                                                 field='mid')

        postfix = '-' + self._get_postfix() + '-with-duplicates'

        if remove_duplicates:
            postfix = '-' + self._get_postfix() + '-no-duplicates'

        filename = os.path.join(self.temp_large_data_folder,
                                ticker + postfix) + '.' + fileformat

        df = time_series_ops.localize_as_UTC(df)
        util_func.write_dataframe_to_binary(df, filename, format=binary_format)
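The discovery step above only collects "mini" files under 10MB, so a previously written large aggregated file sitting in the same tree is not re-read. A standalone sketch of that walk-and-filter step follows; the folder, ticker and file format are assumed values.

import os

temp_data_folder = '/tmp/tcapy_cache'    # assumed folder containing the mini files
ticker = 'EURUSD'
fileformat = 'parquet'                   # assumed on-disk format

filename_list = []

for root, dirnames, filenames in os.walk(temp_data_folder):
    for filename in filenames:
        # Match the ticker and extension, as in _combine_mini_df_from_disk_single_thread
        if ticker in filename and '.' + fileformat in filename:
            full_path = os.path.join(root, filename)

            # Skip anything of 10MB or more, which is likely an already aggregated file
            if os.path.getsize(full_path) < 10 * 1024 * 1024:
                filename_list.append(full_path)

print(str(len(filename_list)) + ' mini files to combine')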
import os

from tcapy.util.loggermanager import LoggerManager
from tcapy.util.utilfunc import UtilFunc

add_vendor = 'dukascopy'

path = '/home/tcapyuser/csv_dump/' + add_vendor + '/'

filenames = os.listdir(path)

util_func = UtilFunc()
logger = LoggerManager.getLogger(__name__)

for filename in filenames:
    format = filename.split('.')[-1]

    if format == 'gzip':
        format = 'parquet'
    elif format == 'h5':
        format = 'hdf5'

    logger.info('Reading to patch file ' + filename)

    df = util_func.read_dataframe_from_binary(os.path.join(path, filename), format=format)

    # Do your edits here, in this case overwriting the ticker column
    ticker = filename.split('_')[0]
    df['ticker'] = ticker

    util_func.write_dataframe_to_binary(df, os.path.join(path, filename), format=format)
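After patching, it can be worth reading one file back to confirm the ticker column was written correctly; a small optional sanity check along the same lines (not part of the original script).

# Optional sanity check: read the first patched file back and inspect the ticker column
if len(filenames) > 0:
    check_filename = filenames[0]
    check_format = check_filename.split('.')[-1]

    if check_format == 'gzip':
        check_format = 'parquet'
    elif check_format == 'h5':
        check_format = 'hdf5'

    df_check = util_func.read_dataframe_from_binary(os.path.join(path, check_filename), format=check_format)

    logger.info('Tickers in ' + check_filename + ': ' + str(df_check['ticker'].unique()))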