Code example #1
    def force_type_conversion(self, data_frame):
        constants = DataConstants()

        logger = LoggerManager().getLogger(__name__)

        if data_frame is not None:
            if not (data_frame.empty):
                # Need to convert numerical and datetime columns separately
                # post pandas 0.23
                for c in data_frame.columns:
                    is_date = False

                    # Special case for ECO_RELEASE_DT / FIRST_REVISION_DATE
                    if 'ECO_RELEASE_DT' in c or 'FIRST_REVISION_DATE' in c:
                        try:
                            temp_col = []  # data_frame[c].values

                            for i in range(0, len(data_frame[c].values)):
                                try:
                                    temp_col.append(
                                        pd.to_datetime(str(
                                            int(data_frame[c].values[i])),
                                                       format='%Y%m%d'))
                                except:
                                    temp_col.append(np.datetime64('NaT'))

                            data_frame[c] = temp_col
                        except Exception as e:
                            logger.warning(
                                "Couldn't convert " + str(c) +
                                " to date... was this column empty? " + str(e))

                    else:
                        # Only convert those Bloomberg reference fields to
                        # dates which have been listed explicitly
                        for d in constants.always_date_columns:
                            if d in c:
                                try:
                                    data_frame[c] = pd.to_datetime(
                                        data_frame[c], errors='coerce')

                                    is_date = True
                                    break
                                except:
                                    pass

                        # Otherwise this is not a date field so attempt to
                        # convert into numbers
                        if not (is_date):
                            try:
                                data_frame[c] = pd.to_numeric(data_frame[c],
                                                              errors='ignore')
                            except:
                                pass

        logger.debug("Returning converted dataframe...")

        return data_frame
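
For context, here is a minimal standalone sketch of the same conversion idea (the DataConstants/LoggerManager wiring from findatapy is omitted, the column values are invented for illustration, and a reasonably recent pandas with the nullable Int64 dtype is assumed):

import pandas as pd

df = pd.DataFrame({
    "ECO_RELEASE_DT": [20230103, 20230104, None],  # YYYYMMDD integers
    "PX_LAST": ["1.05", "1.06", "1.07"],           # numbers stored as strings
})

# Date columns: parse YYYYMMDD integers, falling back to NaT on failure
df["ECO_RELEASE_DT"] = pd.to_datetime(
    df["ECO_RELEASE_DT"].astype("Int64").astype(str),
    format="%Y%m%d", errors="coerce")

# Remaining columns: coerce strings to numbers where possible
df["PX_LAST"] = pd.to_numeric(df["PX_LAST"], errors="coerce")

print(df.dtypes)  # ECO_RELEASE_DT -> datetime64[ns], PX_LAST -> float64
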
Code example #2
    def get_reference_data(self, md_request_vendor, md_request):
        logger = LoggerManager().getLogger(__name__)

        constants = DataConstants()

        end = datetime.utcnow()

        from datetime import timedelta

        # Push the finish date a year into the future, because very often we
        # may wish to download data about future calendar events
        end = end + timedelta(days=365)

        md_request_vendor.finish_date = end

        logger.debug("Requesting ref for " + md_request_vendor.tickers[0] +
                     " etc.")

        data_frame = self.download_ref(md_request_vendor)

        logger.debug("Waiting for ref...")

        # Convert from vendor to findatapy tickers/fields
        if data_frame is not None:
            if data_frame.empty:
                return None

            returned_fields = data_frame.columns.get_level_values(0)
            returned_tickers = data_frame.columns.get_level_values(1)

            # TODO if empty try downloading again a year later
            fields = self.translate_from_vendor_field(returned_fields,
                                                      md_request)
            tickers = self.translate_from_vendor_ticker(
                returned_tickers, md_request)

            ticker_combined = []

            for i in range(0, len(fields)):
                ticker_combined.append(tickers[i] + "." + fields[i])

            data_frame.columns = ticker_combined

            # Need to convert numerical and datetime columns separately post
            # pandas 0.23
            data_frame = self.force_type_conversion(data_frame)

            # data_frame = data_frame.apply(pd.to_datetime, errors='ignore')
            # data_frame = data_frame.apply(pd.to_numeric, errors='ignore')

            # TODO coerce will be deprecated from pandas 0.23.0 onwards, so
            # remove!
            # data_frame = data_frame.convert_objects(convert_dates = 'coerce',
            # convert_numeric= 'coerce')

        return data_frame
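
As the commented-out lines above suggest, the deprecated convert_objects call can be replaced by per-column to_numeric/to_datetime passes. A rough sketch of that idea (this is not findatapy code, and trying numeric before datetime is just one possible ordering):

import pandas as pd

def coerce_columns(df):
    # Try numeric conversion first; columns that are entirely non-numeric
    # get a datetime pass instead, with unparseable values becoming NaT
    out = df.copy()
    for c in out.columns:
        as_num = pd.to_numeric(out[c], errors="coerce")
        if as_num.notna().any():
            out[c] = as_num
        else:
            out[c] = pd.to_datetime(out[c], errors="coerce")
    return out

mixed = pd.DataFrame({"num": ["1", "2"], "dt": ["2023-01-03", "2023-01-04"]})
print(coerce_columns(mixed).dtypes)  # num -> int64, dt -> datetime64[ns]
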
Code example #3
class DataVendorBBG(DataVendor):
    """Abstract class for download of Bloomberg daily, intraday data and reference data.

    Implemented by:
        DataVendorBBGOpen - Adapted version of new Bloomberg Open API for Python which is recommended. Note that this
        requires compilation, via installed C++ compiler. For Python 3.5, this is Microsoft Visual Studio 2015.

        Or it is easier to install blpapi via conda

        Note: no longer supports COM API, which is slower and only 32 bit

    """

    # these fields are BDS style fields to be downloaded using Bloomberg's Reference Data interface
    list_of_ref_fields = [
        'release-date-time-full', 'last-tradeable-day',
        'futures-chain-tickers', 'futures-chain-last-trade-dates',
        'first-notice-date', 'first-tradeable-day', 'cal-non-settle-dates'
    ]

    list_of_ref_vendor_fields = [
        'ECO_FUTURE_RELEASE_DATE_LIST', 'LAST_TRADEABLE_DT', 'FUT_CHAIN',
        'FUT_CHAIN_LAST_TRADE_DATES', 'FUT_NOTICE_FIRST', 'FUT_FIRST_TRADE_DT',
        'CALENDAR_NON_SETTLEMENT_DATES'
    ]

    def __init__(self):
        super(DataVendorBBG, self).__init__()

        self.logger = LoggerManager().getLogger(__name__)

    # implement method in abstract superclass
    def load_ticker(self, market_data_request):
        """Retrieves market data from external data source (in this case Bloomberg)

        Parameters
        ----------
        market_data_request : MarketDataRequest
            contains all the various parameters detailing time series start and finish, tickers etc

        Returns
        -------
        DataFrame
        """
        market_data_request = MarketDataRequest(md_request=market_data_request)
        market_data_request_vendor = self.construct_vendor_market_data_request(
            market_data_request)

        data_frame = None
        self.logger.info("Request Bloomberg data")

        # do we need daily or intraday data?
        if (market_data_request.freq
                in ['daily', 'weekly', 'monthly', 'quarterly', 'yearly']):

            # work out the fields which need to be downloaded via Bloomberg ref request (BDP) and
            # those that can be downloaded via Historical request (BDH)
            ref_fields = []
            ref_vendor_fields = []

            for i in range(0, len(market_data_request.fields)):
                if market_data_request.fields[i] in self.list_of_ref_fields \
                        or market_data_request_vendor.fields[i] in self.list_of_ref_vendor_fields:
                    ref_fields.append(market_data_request.fields[i])
                    ref_vendor_fields.append(
                        market_data_request_vendor.fields[i])

            non_ref_fields = []
            non_ref_vendor_fields = []

            for i in range(0, len(market_data_request.fields)):
                if market_data_request.fields[i] not in self.list_of_ref_fields \
                        and market_data_request_vendor.fields[i] not in self.list_of_ref_vendor_fields:
                    non_ref_fields.append(market_data_request.fields[i])
                    non_ref_vendor_fields.append(
                        market_data_request_vendor.fields[i])

            # for certain cases, need to use ReferenceDataRequest
            # eg. for events times/dates, last tradeable date fields (when specified)
            if len(ref_fields) > 0:

                # careful: make sure you copy the market data request object (when threading, altering that can
                # cause concurrency issues!)
                old_fields = copy.deepcopy(market_data_request.fields)
                old_vendor_fields = copy.deepcopy(
                    market_data_request_vendor.fields)

                # market_data_request = MarketDataRequest(md_request=market_data_request_copy)

                market_data_request.fields = ref_fields
                market_data_request.vendor_fields = ref_vendor_fields
                market_data_request_vendor = self.construct_vendor_market_data_request(
                    market_data_request)

                # just select those reference fields to download via reference
                datetime_data_frame = self.get_reference_data(
                    market_data_request_vendor, market_data_request)

                # download all the other event or non-ref fields (uses HistoricalDataRequest to Bloomberg)
                # concatenate with date time fields
                if len(non_ref_fields) > 0:

                    market_data_request.fields = non_ref_fields
                    market_data_request.vendor_fields = non_ref_vendor_fields
                    market_data_request_vendor = self.construct_vendor_market_data_request(
                        market_data_request)

                    events_data_frame = self.get_daily_data(
                        market_data_request, market_data_request_vendor)

                    col = events_data_frame.index.name
                    events_data_frame = events_data_frame.reset_index(
                        drop=False)

                    data_frame = pandas.concat(
                        [events_data_frame, datetime_data_frame], axis=1)
                    temp = data_frame[col]
                    del data_frame[col]
                    data_frame.index = temp
                else:
                    data_frame = datetime_data_frame

                market_data_request.fields = copy.deepcopy(old_fields)
                market_data_request_vendor.fields = copy.deepcopy(
                    old_vendor_fields)

            # for all other daily/monthly/quarter data, we can use HistoricalDataRequest to Bloomberg
            else:
                data_frame = self.get_daily_data(market_data_request,
                                                 market_data_request_vendor)

                try:
                    # convert fields with release-dt to dates (special case!)
                    for c in data_frame.columns:
                        if 'release-dt' in c:
                            data_frame[c] = (
                                data_frame[c]).astype('int').astype(str).apply(
                                    lambda x: pandas.to_datetime(
                                        x, format='%Y%m%d'))
                except:
                    pass

        # assume one ticker only for intraday data and use IntradayDataRequest to Bloomberg
        if (market_data_request.freq
                in ['tick', 'intraday', 'second', 'minute', 'hourly']):
            market_data_request_vendor.tickers = market_data_request_vendor.tickers[
                0]

            if market_data_request.freq in ['tick', 'second']:
                data_frame = self.download_tick(market_data_request_vendor)
            else:
                data_frame = self.download_intraday(market_data_request_vendor)

            if data_frame is not None:
                if data_frame.empty:
                    try:
                        self.logger.info("No tickers returned for: " +
                                         market_data_request_vendor.tickers)
                    except:
                        pass

                    return None

                cols = data_frame.columns.values

                import pytz

                try:
                    data_frame = data_frame.tz_localize(pytz.utc)
                except:
                    data_frame = data_frame.tz_convert(pytz.utc)

                cols = market_data_request.tickers[0] + "." + cols
                data_frame.columns = cols

        self.logger.info("Completed request from Bloomberg.")

        return data_frame

    def get_daily_data(self, market_data_request, market_data_request_vendor):
        data_frame = self.download_daily(market_data_request_vendor)

        # convert from vendor to findatapy tickers/fields
        if data_frame is not None:
            if data_frame.empty:
                self.logger.info("No tickers returned for...")

                try:
                    self.logger.info(str(market_data_request_vendor.tickers))
                except:
                    pass

                return None

            returned_fields = data_frame.columns.get_level_values(0)
            returned_tickers = data_frame.columns.get_level_values(1)

            # TODO if empty try downloading again a year later
            try:
                fields = self.translate_from_vendor_field(
                    returned_fields, market_data_request)
            except Exception as e:
                self.logger.warning("Could not translate vendor fields: " +
                                    str(e))
                raise

            tickers = self.translate_from_vendor_ticker(
                returned_tickers, market_data_request)

            ticker_combined = []

            for i in range(0, len(fields)):
                ticker_combined.append(tickers[i] + "." + fields[i])

            data_frame.columns = ticker_combined
            data_frame.index.name = 'Date'

        return data_frame

    def get_reference_data(self, market_data_request_vendor,
                           market_data_request):
        end = datetime.utcnow()

        from datetime import timedelta

        # Push the finish date a year into the future, because very often we
        # may wish to download data about future calendar events
        end = end + timedelta(days=365)

        market_data_request_vendor.finish_date = end

        self.logger.debug("Requesting ref for " +
                          market_data_request_vendor.tickers[0] + " etc.")

        data_frame = self.download_ref(market_data_request_vendor)

        self.logger.debug("Waiting for ref...")

        # convert from vendor to findatapy tickers/fields
        if data_frame is not None:
            if data_frame.empty:
                return None

            returned_fields = data_frame.columns.get_level_values(0)
            returned_tickers = data_frame.columns.get_level_values(1)

            # TODO if empty try downloading again a year later
            fields = self.translate_from_vendor_field(returned_fields,
                                                      market_data_request)
            tickers = self.translate_from_vendor_ticker(
                returned_tickers, market_data_request)

            ticker_combined = []

            for i in range(0, len(fields)):
                ticker_combined.append(tickers[i] + "." + fields[i])

            data_frame.columns = ticker_combined

            # need to convert numerical and datetime columns separately post pandas 0.23
            data_frame = data_frame.apply(pandas.to_numeric, errors='ignore')
            data_frame = data_frame.apply(pandas.to_datetime, errors='ignore')

            # TODO coerce will be deprecated from pandas 0.23.0 onwards, so remove!
            # data_frame = data_frame.convert_objects(convert_dates = 'coerce', convert_numeric= 'coerce')

        return data_frame

    # implement method in abstract superclass
    @abc.abstractmethod
    def kill_session(self):
        return

    @abc.abstractmethod
    def download_tick(self, market_data_request):
        return

    @abc.abstractmethod
    def download_intraday(self, market_data_request):
        return

    @abc.abstractmethod
    def download_daily(self, market_data_request):
        return

    @abc.abstractmethod
    def download_ref(self, market_data_request):
        return
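
The download_* hooks above are filled in by a concrete subclass (DataVendorBBGOpen in findatapy). Below is a self-contained sketch of the column contract that get_daily_data and get_reference_data rely on, i.e. a two-level column index of (vendor field, vendor ticker), which is inferred here from the get_level_values calls above; the field/ticker names are illustrative:

import abc

import pandas as pd

class VendorBase(abc.ABC):
    @abc.abstractmethod
    def download_daily(self, market_data_request):
        return

class VendorStub(VendorBase):
    def download_daily(self, market_data_request):
        # Level 0 = vendor field, level 1 = vendor ticker, matching what
        # get_daily_data unpacks via columns.get_level_values(0) / (1)
        cols = pd.MultiIndex.from_tuples([("PX_LAST", "EURUSD Curncy")])
        return pd.DataFrame([[1.05]], columns=cols,
                            index=pd.to_datetime(["2023-01-03"]))

df = VendorStub().download_daily(None)
print(list(df.columns.get_level_values(0)))  # ['PX_LAST']
print(list(df.columns.get_level_values(1)))  # ['EURUSD Curncy']
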
Code example #4
class DataVendorBBG(DataVendor):

    def __init__(self):
        super(DataVendorBBG, self).__init__()
        self.logger = LoggerManager().getLogger(__name__)

    # implement method in abstract superclass
    def load_ticker(self, market_data_request):
        """
        load_ticker - Retrieves market data from external data source (in this case Bloomberg)

        Parameters
        ----------
        market_data_request : MarketDataRequest
            contains all the various parameters detailing time series start and finish, tickers etc

        Returns
        -------
        DataFrame
        """
        market_data_request_vendor = self.construct_vendor_market_data_request(market_data_request)

        data_frame = None
        self.logger.info("Request Bloomberg data")

        # do we need daily or intraday data?
        if (market_data_request.freq in ['daily', 'weekly', 'monthly', 'quarterly', 'yearly']):

            # event times/dates need a separate ReferenceDataRequest (when specified)
            if 'release-date-time-full' in market_data_request.fields:

                # experimental!!
                # careful: make sure you copy the market data request object (when threading, altering that can
                # cause concurrency issues!)
                datetime_data_frame = self.get_reference_data(market_data_request_vendor, market_data_request)

                old_fields = copy.deepcopy(market_data_request.fields)
                old_vendor_fields = copy.deepcopy(market_data_request_vendor.fields)

                # remove the 'release-date-time-full' field from our request
                # (and the associated vendor field) if present
                try:
                    index = market_data_request.fields.index('release-date-time-full')

                    market_data_request.fields.pop(index)
                    market_data_request_vendor.fields.pop(index)
                except:
                    pass

                # download all the other event fields (uses HistoricalDataRequest to Bloomberg)
                # concatenate with date time fields
                if len(market_data_request_vendor.fields) > 0:
                    events_data_frame = self.get_daily_data(market_data_request, market_data_request_vendor)

                    col = events_data_frame.index.name
                    events_data_frame = events_data_frame.reset_index(drop = False)

                    data_frame = pandas.concat([events_data_frame, datetime_data_frame], axis = 1)
                    temp = data_frame[col]
                    del data_frame[col]
                    data_frame.index = temp
                else:
                    data_frame = datetime_data_frame

                market_data_request.fields = old_fields
                market_data_request_vendor.fields = old_vendor_fields

            # for all other daily/monthly/quarter data, we can use HistoricalDataRequest to Bloomberg
            else:
                data_frame = self.get_daily_data(market_data_request, market_data_request_vendor)

        # assume one ticker only
        # for intraday data we use IntradayDataRequest to Bloomberg
        if (market_data_request.freq in ['tick', 'intraday', 'second', 'minute', 'hourly']):
            market_data_request_vendor.tickers = market_data_request_vendor.tickers[0]

            if market_data_request.freq in ['tick', 'second']:
                data_frame = self.download_tick(market_data_request_vendor)
            else:
                data_frame = self.download_intraday(market_data_request_vendor)

            if data_frame is not None:
                if data_frame.empty:
                    try:
                        self.logger.info("No tickers returned for: " + market_data_request_vendor.tickers)
                    except:
                        pass

                    return None

                cols = data_frame.columns.values

                import pytz

                try:
                    data_frame = data_frame.tz_localize(pytz.utc)
                except:
                    data_frame = data_frame.tz_convert(pytz.utc)

                cols = market_data_request.tickers[0] + "." + cols
                data_frame.columns = cols

        self.logger.info("Completed request from Bloomberg.")

        return data_frame

    def get_daily_data(self, market_data_request, market_data_request_vendor):
        data_frame = self.download_daily(market_data_request_vendor)

        # convert from vendor to findatapy tickers/fields
        if data_frame is not None:
            if data_frame.empty:
                self.logger.info("No tickers returned for...")

                try:
                    self.logger.info(str(market_data_request_vendor.tickers))
                except: pass

                return None

            returned_fields = data_frame.columns.get_level_values(0)
            returned_tickers = data_frame.columns.get_level_values(1)

            # TODO if empty try downloading again a year later
            try:
                fields = self.translate_from_vendor_field(returned_fields, market_data_request)
            except Exception as e:
                self.logger.warning("Could not translate vendor fields: " + str(e))
                raise

            tickers = self.translate_from_vendor_ticker(returned_tickers, market_data_request)

            ticker_combined = []

            for i in range(0, len(fields)):
                ticker_combined.append(tickers[i] + "." + fields[i])

            data_frame.columns = ticker_combined
            data_frame.index.name = 'Date'

        return data_frame

    def get_reference_data(self, market_data_request_vendor, market_data_request):
        end = datetime.utcnow()

        from datetime import timedelta

        # Push the finish date a year into the future
        end = end + timedelta(days=365)

        market_data_request_vendor.finish_date = end

        self.logger.debug("Requesting ref for " + market_data_request_vendor.tickers[0] + " etc.")

        data_frame = self.download_ref(market_data_request_vendor)

        self.logger.debug("Waiting for ref...")

        # convert from vendor to findatapy tickers/fields
        if data_frame is not None:
            returned_fields = data_frame.columns.get_level_values(0)
            returned_tickers = data_frame.columns.get_level_values(1)

            # TODO if empty try downloading again a year later
            fields = self.translate_from_vendor_field(returned_fields, market_data_request)
            tickers = self.translate_from_vendor_ticker(returned_tickers, market_data_request)

            ticker_combined = []

            for i in range(0, len(fields)):
                ticker_combined.append(tickers[i] + "." + fields[i])

            data_frame.columns = ticker_combined

            # TODO coerce will be deprecated from pandas
            data_frame = data_frame.convert_objects(convert_dates = 'coerce', convert_numeric= 'coerce')

        return data_frame

    # implement method in abstract superclass
    @abc.abstractmethod
    def kill_session(self):
        return

    @abc.abstractmethod
    def download_tick(self, market_data_request):
        return

    @abc.abstractmethod
    def download_intraday(self, market_data_request):
        return

    @abc.abstractmethod
    def download_daily(self, market_data_request):
        return

    @abc.abstractmethod
    def download_ref(self, market_data_request):
        return
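
Both versions of load_ticker normalise intraday timestamps to UTC via a try/except around tz_localize/tz_convert. A small sketch of why both calls are needed (the data here is illustrative):

import pandas as pd
import pytz

naive = pd.DataFrame({"x": [1.0]},
                     index=pd.to_datetime(["2023-01-03 10:00"]))
aware = naive.tz_localize(pytz.utc)

# tz_localize only works on a timezone-naive index, and tz_convert only on
# an aware one, hence the try/except fallback in load_ticker
print(naive.tz_localize(pytz.utc).index.tz)  # UTC
print(aware.tz_convert(pytz.utc).index.tz)   # UTC
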
Code example #5
class IOEngine(object):
    """Write and reads time series data to disk in various formats, CSV, HDF5 (fixed and table formats) and MongoDB/Arctic.

    Can be used to save down output of finmarketpy backtests and also to cache market data locally.

    Also supports BColz (but not currently stable). Planning to add other interfaces such as SQL etc.

    """
    def __init__(self):
        self.logger = LoggerManager().getLogger(__name__)

    ### functions to handle Excel on disk
    def write_time_series_to_excel(self,
                                   fname,
                                   sheet,
                                   data_frame,
                                   create_new=False):
        """Writes Pandas data frame to disk in Excel format

        Parameters
        ----------
        fname : str
            Excel filename to be written to
        sheet : str
            sheet name within the Excel file
        data_frame : DataFrame
            data frame to be written
        create_new : boolean
            whether to create a new Excel file
        """

        if (create_new):
            writer = pandas.ExcelWriter(fname, engine='xlsxwriter')
        else:
            if os.path.isfile(fname):
                book = load_workbook(fname)

                # appending sheets to an existing workbook requires openpyxl
                # (xlsxwriter can only create new files)
                writer = pandas.ExcelWriter(fname, engine='openpyxl')
                writer.book = book
                writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
            else:
                writer = pandas.ExcelWriter(fname, engine='xlsxwriter')

        data_frame.to_excel(writer, sheet_name=sheet, engine='xlsxwriter')

        writer.save()
        writer.close()

    def write_time_series_to_excel_writer(self, writer, sheet, data_frame):
        """Writes Pandas data frame to disk in Excel format for a writer

        Parameters
        ----------
        writer : ExcelWriter
            File handle to use for writing Excel file to disk
        sheet : str
            sheet name within the Excel file
        data_frame : DataFrame
            data frame to be written
        """
        data_frame.to_excel(writer, sheet, engine='xlsxwriter')

    def read_excel_data_frame(self,
                              f_name,
                              excel_sheet,
                              freq,
                              cutoff=None,
                              dateparse=None,
                              postfix='.close',
                              intraday_tz='UTC'):
        """Reads Excel from disk into DataFrame

        Parameters
        ----------
        f_name : str
            Excel file path to read
        excel_sheet : str
            Excel sheet to read
        freq : str
            Frequency of data to read (intraday/daily etc)
        cutoff : DateTime (optional)
            end date to read up to
        dateparse : str (optional)
            date parser to use
        postfix : str (optional)
            postfix to add to each column
        intraday_tz : str
            timezone of the file if it contains intraday data

        Returns
        -------
        DataFrame
        """

        return self.read_csv_data_frame(f_name,
                                        freq,
                                        cutoff=cutoff,
                                        dateparse=dateparse,
                                        postfix=postfix,
                                        intraday_tz=intraday_tz,
                                        excel_sheet=excel_sheet)

    def remove_time_series_cache_on_disk(self,
                                         fname,
                                         engine='hdf5_fixed',
                                         db_server='127.0.0.1',
                                         db_port='6379',
                                         timeout=10,
                                         username=None,
                                         password=None):

        if 'hdf5' in engine:
            engine = 'hdf5'

        if (engine == 'bcolz'):
            # convert invalid characters (which bcolz can't deal with) to substitutes
            pass
        elif (engine == 'redis'):
            import redis

            fname = os.path.basename(fname).replace('.', '_')

            try:
                r = redis.StrictRedis(host=db_server,
                                      port=db_port,
                                      db=0,
                                      socket_timeout=timeout,
                                      socket_connect_timeout=timeout)

                if (fname == 'flush_all_keys'):
                    r.flushall()
                else:
                    # allow deletion of keys by pattern matching
                    if "*" in fname:
                        x = r.keys(fname)

                        if len(x) > 0:
                            r.delete(*x)

                    r.delete(fname)

            except Exception as e:
                self.logger.warning("Cannot delete non-existent key " + fname +
                                    " in Redis: " + str(e))

        elif (engine == 'arctic'):
            from arctic import Arctic
            import pymongo

            socketTimeoutMS = 30 * 1000
            fname = os.path.basename(fname).replace('.', '_')

            self.logger.info('Load MongoDB library: ' + fname)

            if username is not None and password is not None:
                c = pymongo.MongoClient(
                    host="mongodb://" + username + ":" + password + "@" +
                    str(db_server) + ":" + str(db_port),
                    connect=False)  # , username=username, password=password)
            else:
                c = pymongo.MongoClient(host="mongodb://" + str(db_server) +
                                        ":" + str(db_port),
                                        connect=False)

            store = Arctic(c,
                           socketTimeoutMS=socketTimeoutMS,
                           serverSelectionTimeoutMS=socketTimeoutMS,
                           connectTimeoutMS=socketTimeoutMS)

            store.delete_library(fname)

            c.close()

            self.logger.info("Deleted MongoDB library: " + fname)

        elif (engine == 'hdf5'):
            h5_filename = self.get_h5_filename(fname)

            # delete the old copy
            try:
                os.remove(h5_filename)
            except:
                pass

    ### functions to handle HDF5 on disk
    def write_time_series_cache_to_disk(self,
                                        fname,
                                        data_frame,
                                        engine='hdf5_fixed',
                                        append_data=False,
                                        db_server=DataConstants().db_server,
                                        db_port=DataConstants().db_port,
                                        username=None,
                                        password=None,
                                        filter_out_matching=None,
                                        timeout=10):
        """Writes Pandas data frame to disk as HDF5 format or bcolz format or in Arctic

        Parameters
        ----------
        fname : str
            path of file
        data_frame : DataFrame
            data frame to be written to disk
        engine : str
            'hdf5_fixed' - use HDF5 fixed format, very quick, but cannot be appended to
            'hdf5_table' - use HDF5 table format, slower but can be appended to
            'parquet' - use Parquet
            'arctic' - use Arctic/MongoDB database
            'redis' - use Redis
            'bcolz' - use bcolz (experimental)
        append_data : bool
            False - write a fresh copy of data on disk each time
            True - append data to disk
        db_server : str
            Database server for arctic (default: '127.0.0.1')
        timeout : int
            Number of seconds before the connection times out
        """

        # default HDF5 format
        hdf5_format = 'fixed'

        if 'hdf5' in engine:
            hdf5_format = engine.split('_')[1]
            engine = 'hdf5'

        if (engine == 'bcolz'):
            # convert invalid characters (which bcolz can't deal with) to substitutes
            data_frame.columns = self.find_replace_chars(
                data_frame.columns, _invalid_chars, _replace_chars)
            data_frame.columns = ['A_' + x for x in data_frame.columns]

            data_frame['DTS_'] = pandas.to_datetime(data_frame.index,
                                                    unit='ns')

            bcolzpath = self.get_bcolz_filename(fname)
            shutil.rmtree(bcolzpath, ignore_errors=True)
            zlens = bcolz.ctable.fromdataframe(data_frame, rootdir=bcolzpath)
        elif (engine == 'redis'):
            import redis

            fname = os.path.basename(fname).replace('.', '_')

            try:
                r = redis.StrictRedis(host=db_server,
                                      port=db_port,
                                      db=0,
                                      socket_timeout=timeout,
                                      socket_connect_timeout=timeout)

                if isinstance(data_frame, pandas.DataFrame):
                    r.set(fname, data_frame.to_msgpack(compress='blosc'))

                self.logger.debug("Pushed " + fname + " to Redis")
            except Exception as e:
                self.logger.warning("Couldn't push " + fname + " to Redis: " +
                                    str(e))

        elif (engine == 'arctic'):
            from arctic import Arctic
            import pymongo

            socketTimeoutMS = 30 * 1000
            fname = os.path.basename(fname).replace('.', '_')

            self.logger.info('Load Arctic/MongoDB library: ' + fname)

            if username is not None and password is not None:
                c = pymongo.MongoClient(
                    host="mongodb://" + username + ":" + password + "@" +
                    str(db_server) + ":" + str(db_port),
                    connect=False)  # , username=username, password=password)
            else:
                c = pymongo.MongoClient(host="mongodb://" + str(db_server) +
                                        ":" + str(db_port),
                                        connect=False)

            store = Arctic(c,
                           socketTimeoutMS=socketTimeoutMS,
                           serverSelectionTimeoutMS=socketTimeoutMS,
                           connectTimeoutMS=socketTimeoutMS)

            database = None

            try:
                database = store[fname]
            except:
                pass

            if database is None:
                store.initialize_library(fname, audit=False)
                self.logger.info("Created MongoDB library: " + fname)
            else:
                self.logger.info("Got MongoDB library: " + fname)

            # Access the library
            library = store[fname]

            if ('intraday' in fname):
                data_frame = data_frame.astype('float32')

            if filter_out_matching is not None:
                cols = data_frame.columns

                new_cols = []

                for col in cols:
                    if filter_out_matching not in col:
                        new_cols.append(col)

                data_frame = data_frame[new_cols]

            # problems with Arctic when writing timezone to disk sometimes, so strip
            data_frame = data_frame.copy().tz_localize(None)

            # can duplicate values if we have existing dates
            if append_data:
                library.append(fname, data_frame)
            else:
                library.write(fname, data_frame)

            c.close()

            self.logger.info("Written MongoDB library: " + fname)

        elif (engine == 'hdf5'):
            h5_filename = self.get_h5_filename(fname)

            # append data only works for HDF5 stored as tables (but this is much slower than fixed format)
            # removes duplicated entries at the end
            if append_data:
                store = pandas.HDFStore(h5_filename,
                                        format=hdf5_format,
                                        complib="blosc",
                                        complevel=9)

                if ('intraday' in fname):
                    data_frame = data_frame.astype('float32')

                # get the last row which matches and remove everything after
                # that (because the append function doesn't check for
                # duplicated rows)
                nrows = len(store['data'].index)
                last_point = data_frame.index[-1]

                i = nrows - 1

                while (i > 0):
                    read_index = store.select('data', start=i,
                                              stop=nrows).index[0]

                    if (read_index <= last_point): break

                    i = i - 1

                # remove rows at the end, which are duplicates of the incoming time series
                store.remove(key='data', start=i, stop=nrows)
                store.put(key='data',
                          value=data_frame,
                          format=hdf5_format,
                          append=True)
                store.close()
            else:
                h5_filename_temp = self.get_h5_filename(fname + ".temp")

                # delete the old copy
                try:
                    os.remove(h5_filename_temp)
                except:
                    pass

                store = pandas.HDFStore(h5_filename_temp,
                                        format=hdf5_format,
                                        complib="blosc",
                                        complevel=9)

                if ('intraday' in fname):
                    data_frame = data_frame.astype('float32')

                store.put(key='data', value=data_frame, format=hdf5_format)
                store.close()

                # delete the old copy
                try:
                    os.remove(h5_filename)
                except:
                    pass

                # once written to disk rename
                os.rename(h5_filename_temp, h5_filename)

            self.logger.info("Written HDF5: " + fname)

        elif (engine == 'parquet'):
            if fname[-5:] != '.gzip':
                fname = fname + '.gzip'

            data_frame.to_parquet(fname, compression='gzip')

            self.logger.info("Written Parquet: " + fname)

    def get_h5_filename(self, fname):
        """Strips h5 off filename returning first portion of filename

        Parameters
        ----------
        fname : str
            h5 filename to strip

        Returns
        -------
        str
        """
        if fname[-3:] == '.h5':
            return fname

        return fname + ".h5"

    def get_bcolz_filename(self, fname):
        """Strips bcolz off filename returning first portion of filename

        Parameters
        ----------
        fname : str
            bcolz filename to strip

        Returns
        -------
        str
        """
        if fname[-6:] == '.bcolz':
            return fname

        return fname + ".bcolz"

    def write_r_compatible_hdf_dataframe(self, data_frame, fname, fields=None):
        """Write a DataFrame to disk in as an R compatible HDF5 file.

        Parameters
        ----------
        data_frame : DataFrame
            data frame to be written
        fname : str
            file path to be written
        fields : list(str)
            columns to be written
        """
        fname_r = self.get_h5_filename(fname)

        self.logger.info("About to dump R binary HDF5 - " + fname_r)
        data_frame32 = data_frame.astype('float32')

        if fields is None:
            fields = data_frame32.columns.values

        # decompose date/time into individual fields (easier to pick up in R)
        data_frame32['Year'] = data_frame.index.year
        data_frame32['Month'] = data_frame.index.month
        data_frame32['Day'] = data_frame.index.day
        data_frame32['Hour'] = data_frame.index.hour
        data_frame32['Minute'] = data_frame.index.minute
        data_frame32['Second'] = data_frame.index.second
        data_frame32['Millisecond'] = data_frame.index.microsecond / 1000

        data_frame32 = data_frame32[[
            'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second', 'Millisecond'
        ] + list(fields)]

        cols = data_frame32.columns

        store_export = pandas.HDFStore(fname_r)
        store_export.put('df_for_r', data_frame32, data_columns=cols)
        store_export.close()

    def read_time_series_cache_from_disk(self,
                                         fname,
                                         engine='hdf5',
                                         start_date=None,
                                         finish_date=None,
                                         db_server=DataConstants().db_server,
                                         db_port=DataConstants().db_port,
                                         username=None,
                                         password=None):
        """Reads time series cache from disk in either HDF5 or bcolz

        Parameters
        ----------
        fname : str (or list)
            file to be read from
        engine : str (optional)
            'hdf5' - reads HDF5 files (default)
            'arctic' - reads from Arctic/MongoDB database
            'redis' - reads from Redis
            'bcolz' - reads from bcolz file (not fully implemented)
        start_date : str/datetime (optional)
            Start date
        finish_date : str/datetime (optional)
            Finish date
        db_server : str
            IP address of MongoDB (default '127.0.0.1')

        Returns
        -------
        DataFrame
        """

        logger = LoggerManager().getLogger(__name__)

        data_frame_list = []

        if not (isinstance(fname, list)):
            if '*' in fname:
                fname = glob.glob(fname)
            else:
                fname = [fname]

        for fname_single in fname:
            logger.debug("Reading " + fname_single + "..")

            if (engine == 'bcolz'):
                try:
                    name = self.get_bcolz_filename(fname_single)
                    zlens = bcolz.open(rootdir=name)
                    data_frame = zlens.todataframe()

                    data_frame.index = pandas.DatetimeIndex(data_frame['DTS_'])
                    data_frame.index.name = 'Date'
                    del data_frame['DTS_']

                    # convert invalid characters (which Bcolz can't deal with) to more readable characters for pandas
                    data_frame.columns = self.find_replace_chars(
                        data_frame.columns, _replace_chars, _invalid_chars)
                    data_frame.columns = [x[2:] for x in data_frame.columns]
                except:
                    data_frame = None

            elif (engine == 'redis'):
                import redis

                fname_single = os.path.basename(fname_single).replace('.', '_')

                msg = None

                try:
                    r = redis.StrictRedis(host=db_server, port=db_port, db=0)
                    msg = r.get(fname_single)

                except:
                    self.logger.info("Cache does not exist for " +
                                     fname_single + " in Redis")

                if msg is None:
                    data_frame = None
                else:

                    self.logger.info('Load Redis cache: ' + fname_single)

                    data_frame = pandas.read_msgpack(msg)

            elif (engine == 'arctic'):
                socketTimeoutMS = 2 * 1000

                import pymongo
                from arctic import Arctic

                fname_single = os.path.basename(fname_single).replace('.', '_')

                self.logger.info('Load Arctic/MongoDB library: ' +
                                 fname_single)

                if username is not None and password is not None:
                    c = pymongo.MongoClient(
                        host="mongodb://" + username + ":" + password + "@" +
                        str(db_server) + ":" + str(db_port),
                        connect=False
                    )  # , username=username, password=password)
                else:
                    c = pymongo.MongoClient(host="mongodb://" +
                                            str(db_server) + ":" +
                                            str(db_port),
                                            connect=False)

                store = Arctic(c,
                               socketTimeoutMS=socketTimeoutMS,
                               serverSelectionTimeoutMS=socketTimeoutMS)

                # Access the library
                try:
                    library = store[fname_single]

                    if start_date is None and finish_date is None:
                        item = library.read(fname_single)

                    else:
                        from arctic.date import DateRange
                        item = library.read(
                            fname_single,
                            date_range=DateRange(
                                start_date.replace(tzinfo=None),
                                finish_date.replace(tzinfo=None)))

                    c.close()

                    self.logger.info('Read ' + fname_single)

                    data_frame = item.data

                except Exception as e:
                    self.logger.warning(
                        'Library may not exist or another error: ' +
                        fname_single + ' & message is ' + str(e))
                    data_frame = None

            elif os.path.isfile(self.get_h5_filename(fname_single)):
                store = pandas.HDFStore(self.get_h5_filename(fname_single))
                data_frame = store.select("data")

                if ('intraday' in fname_single):
                    data_frame = data_frame.astype('float32')

                store.close()

            elif os.path.isfile(fname_single):
                data_frame = pandas.read_parquet(fname_single)

            data_frame_list.append(data_frame)

        if len(data_frame_list) == 1:
            return data_frame_list[0]

        return data_frame_list

    ### functions for CSV reading and writing
    def write_time_series_to_csv(self, csv_path, data_frame):
        data_frame.to_csv(csv_path)

    def read_csv_data_frame(self,
                            f_name,
                            freq,
                            cutoff=None,
                            dateparse=None,
                            postfix='.close',
                            intraday_tz='UTC',
                            excel_sheet=None):
        """Reads CSV/Excel from disk into DataFrame

        Parameters
        ----------
        f_name : str
            CSV/Excel file path to read
        freq : str
            Frequency of data to read (intraday/daily etc)
        cutoff : DateTime (optional)
            end date to read up to
        dateparse : str (optional)
            date parser to use
        postfix : str (optional)
            postfix to add to each columns
        intraday_tz : str (optional)
            timezone of file if uses intraday data
        excel_sheet : str (optional)
            Excel sheet to be read

        Returns
        -------
        DataFrame
        """

        if (freq == 'intraday'):

            if dateparse is None:
                dateparse = lambda x: datetime.datetime(*map(
                    int,
                    [x[6:10], x[3:5], x[0:2], x[11:13], x[14:16], x[17:19]]))
            elif dateparse == 'dukascopy':
                dateparse = lambda x: datetime.datetime(*map(
                    int,
                    [x[0:4], x[5:7], x[8:10], x[11:13], x[14:16], x[17:19]]))
            elif dateparse == 'c':
                # use C library for parsing dates, several hundred times quicker
                # requires compilation of library to install
                import ciso8601
                dateparse = lambda x: ciso8601.parse_datetime(x)

            if excel_sheet is None:
                data_frame = pandas.read_csv(f_name,
                                             index_col=0,
                                             parse_dates=True,
                                             date_parser=dateparse)
            else:
                data_frame = pandas.read_excel(f_name,
                                               excel_sheet,
                                               index_col=0,
                                               na_values=['NA'])

            data_frame = data_frame.astype('float32')
            data_frame.index.names = ['Date']

            old_cols = data_frame.columns
            new_cols = []

            # add '.close' to each column name
            for col in old_cols:
                new_cols.append(col + postfix)

            data_frame.columns = new_cols
        else:
            # daily data
            if 'events' in f_name:

                data_frame = pandas.read_csv(f_name)

                # very slow conversion; note that convert_objects is
                # deprecated in newer versions of pandas
                data_frame = data_frame.convert_objects(convert_dates='coerce')

            else:
                if excel_sheet is None:
                    try:
                        data_frame = pandas.read_csv(f_name,
                                                     index_col=0,
                                                     parse_dates=["DATE"],
                                                     date_parser=dateparse)
                    except:
                        data_frame = pandas.read_csv(f_name,
                                                     index_col=0,
                                                     parse_dates=["Date"],
                                                     date_parser=dateparse)
                else:
                    data_frame = pandas.read_excel(f_name,
                                                   excel_sheet,
                                                   index_col=0,
                                                   na_values=['NA'])

        # convert Date to Python datetime
        # datetime data_frame['Date1'] = data_frame.index

        # slower method: lambda x: pandas.datetime.strptime(x, '%d/%m/%Y %H:%M:%S')
        # data_frame['Date1'].apply(lambda x: datetime.datetime(int(x[6:10]), int(x[3:5]), int(x[0:2]),
        #                                        int(x[12:13]), int(x[15:16]), int(x[18:19])))

        # data_frame.index = data_frame['Date1']
        # data_frame.drop('Date1')

        # slower method: data_frame.index = pandas.to_datetime(data_frame.index)

        if (freq == 'intraday'):
            # assume time series are already in UTC and assign this (can specify other time zones)
            data_frame = data_frame.tz_localize(intraday_tz)

        # end cutoff date
        if cutoff is not None:
            if (isinstance(cutoff, str)):
                cutoff = parse(cutoff)

            data_frame = data_frame.loc[data_frame.index < cutoff]

        return data_frame

    def find_replace_chars(self, array, to_find, replace_with):

        for i in range(0, len(to_find)):
            array = [x.replace(to_find[i], replace_with[i]) for x in array]

        return array

    def convert_csv_data_frame(self,
                               f_name,
                               category,
                               freq,
                               cutoff=None,
                               dateparse=None):
        """Converts CSV file to HDF5 file

        Parameters
        ----------
        f_name : str
            File name to be read
        category : str
            data category of file (used in HDF5 filename)
        freq : str
            intraday/daily frequency (used in HDF5 filename)
        cutoff : DateTime (optional)
            filter dates up to here
        dateparse : str
            date parser to use
        """

        self.logger.info("About to read... " + f_name)

        data_frame = self.read_csv_data_frame(f_name,
                                              freq,
                                              cutoff=cutoff,
                                              dateparse=dateparse)

        category_f_name = self.create_cache_file_name(category)

        self.write_time_series_cache_to_disk(category_f_name, data_frame)

    def clean_csv_file(self, f_name):
        """Cleans up CSV file (removing empty characters) before writing back to disk

        Parameters
        ----------
        f_name : str
            CSV file to be cleaned
        """

        with codecs.open(f_name, 'rb', 'utf-8') as myfile:
            data = myfile.read()

            # clean file first if dirty
            if data.count('\x00'):
                self.logger.info('Cleaning CSV...')

                with codecs.open(f_name + '.tmp', 'w', 'utf-8') as of:
                    of.write(data.replace('\x00', ''))

                shutil.move(f_name + '.tmp', f_name)

    def create_cache_file_name(self, filename):
        return DataConstants().folder_time_series_data + "/" + filename

    # TODO refactor IOEngine so that each database is implemented in a subclass of DBEngine

    def get_engine(self, engine='hdf5_fixed'):
        pass
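
Finally, a minimal round trip through the Parquet branch of write_time_series_cache_to_disk and read_time_series_cache_from_disk, reduced to the underlying pandas calls (the filename and data are invented, a Parquet engine such as pyarrow is assumed to be installed, and the real methods additionally handle the '.gzip' suffix and findatapy folder conventions):

import pandas as pd

df = pd.DataFrame({"EURUSD.close": [1.05, 1.06]},
                  index=pd.to_datetime(["2023-01-03", "2023-01-04"]))
df.index.name = "Date"

fname = "eurusd_cache.gzip"
df.to_parquet(fname, compression="gzip")  # the write path

df_back = pd.read_parquet(fname)  # ...and the read path
assert df.equals(df_back)
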