def force_type_conversion(self, data_frame):
    constants = DataConstants()
    logger = LoggerManager().getLogger(__name__)

    if data_frame is not None:
        if not data_frame.empty:
            # Need to convert numerical and datetime columns separately
            # post pandas 0.23
            for c in data_frame.columns:
                is_date = False

                # Special case for ECO_RELEASE_DT / FIRST_REVISION_DATE,
                # which are returned as YYYYMMDD integers
                if 'ECO_RELEASE_DT' in c or 'FIRST_REVISION_DATE' in c:
                    try:
                        temp_col = []

                        for i in range(0, len(data_frame[c].values)):
                            try:
                                temp_col.append(pd.to_datetime(
                                    str(int(data_frame[c].values[i])),
                                    format='%Y%m%d'))
                            except:
                                temp_col.append(np.datetime64('NaT'))

                        data_frame[c] = temp_col
                    except Exception as e:
                        logger.warning(
                            "Couldn't convert " + str(c)
                            + " to date.. was this column empty? " + str(e))
                else:
                    # Only convert those Bloomberg reference fields to
                    # dates which have been listed explicitly
                    for d in constants.always_date_columns:
                        if d in c:
                            try:
                                data_frame[c] = pd.to_datetime(
                                    data_frame[c], errors='coerce')

                                is_date = True
                                break
                            except:
                                pass

                    # Otherwise this is not a date field, so attempt to
                    # convert it into numbers
                    if not is_date:
                        try:
                            data_frame[c] = pd.to_numeric(
                                data_frame[c], errors='ignore')
                        except:
                            pass

    logger.debug("Returning converted dataframe...")

    return data_frame
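# A minimal, self-contained sketch (hypothetical column names) of the coercion
# that force_type_conversion applies: YYYYMMDD integers become datetimes with
# NaT for unparseable entries, while other columns are pushed towards numeric
# types. It uses errors='coerce' for the numeric step purely for illustration.
def _example_force_type_conversion_sketch():
    import pandas as pd

    df = pd.DataFrame({
        'US_ECO_RELEASE_DT': [20230114, 20230215, None],  # hypothetical
        'PX_LAST': ['1.2345', '1.2401', 'bad']})

    # Same idea as the ECO_RELEASE_DT branch above: int -> str -> datetime
    df['US_ECO_RELEASE_DT'] = pd.to_datetime(
        df['US_ECO_RELEASE_DT'].dropna().astype(int).astype(str),
        format='%Y%m%d', errors='coerce').reindex(df.index)

    # Non-date fields are coerced towards numbers ('bad' becomes NaN here)
    df['PX_LAST'] = pd.to_numeric(df['PX_LAST'], errors='coerce')

    return df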
def get_reference_data(self, md_request_vendor, md_request):
    logger = LoggerManager().getLogger(__name__)
    constants = DataConstants()

    end = datetime.utcnow()

    from datetime import timedelta

    # Very often we may wish to download data about future calendar events,
    # so push the finish date a year into the future
    end = end + timedelta(days=365)

    md_request_vendor.finish_date = end

    logger.debug("Requesting ref for " + md_request_vendor.tickers[0]
                 + " etc.")

    data_frame = self.download_ref(md_request_vendor)

    logger.debug("Waiting for ref...")

    # Convert from vendor to findatapy tickers/fields
    if data_frame is not None:
        if data_frame.empty:
            return None

        returned_fields = data_frame.columns.get_level_values(0)
        returned_tickers = data_frame.columns.get_level_values(1)

    if data_frame is not None:
        # TODO if empty try downloading again a year later
        fields = self.translate_from_vendor_field(returned_fields, md_request)
        tickers = self.translate_from_vendor_ticker(returned_tickers,
                                                    md_request)

        ticker_combined = []

        for i in range(0, len(fields)):
            ticker_combined.append(tickers[i] + "." + fields[i])

        data_frame.columns = ticker_combined

        # Need to convert numerical and datetime columns separately post
        # pandas 0.23
        data_frame = self.force_type_conversion(data_frame)

    return data_frame
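# A small sketch (hypothetical tickers/fields) of how the two-level vendor
# columns returned by download_ref are flattened into "ticker.field" names,
# mirroring the loop in get_reference_data above. In the real method the
# vendor names are first translated back to findatapy names.
def _example_flatten_vendor_columns():
    import pandas as pd

    data_frame = pd.DataFrame(
        [[1, 2]],
        columns=pd.MultiIndex.from_tuples(
            [('last-tradeable-day', 'EURUSD Curncy'),
             ('last-tradeable-day', 'GBPUSD Curncy')]))

    returned_fields = data_frame.columns.get_level_values(0)
    returned_tickers = data_frame.columns.get_level_values(1)

    data_frame.columns = [t + "." + f for t, f in
                          zip(returned_tickers, returned_fields)]

    return data_frame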
class DataVendorBBG(DataVendor):
    """Abstract class for download of Bloomberg daily, intraday data and
    reference data.

    Implemented by:
        DataVendorBBGOpen - an adapted version of the Bloomberg Open API for
        Python, which is recommended. Note that this requires compilation via
        an installed C++ compiler (for Python 3.5 this is Microsoft Visual
        Studio 2015), or it is easier to install blpapi via conda.

    Note: no longer supports the COM API, which is slower and only 32 bit.
    """

    # These fields are BDS style fields to be downloaded using Bloomberg's
    # Reference Data interface
    list_of_ref_fields = [
        'release-date-time-full', 'last-tradeable-day',
        'futures-chain-tickers', 'futures-chain-last-trade-dates',
        'first-notice-date', 'first-tradeable-day', 'cal-non-settle-dates']

    list_of_ref_vendor_fields = [
        'ECO_FUTURE_RELEASE_DATE_LIST', 'LAST_TRADEABLE_DT', 'FUT_CHAIN',
        'FUT_CHAIN_LAST_TRADE_DATES', 'FUT_NOTICE_FIRST',
        'FUT_FIRST_TRADE_DT', 'CALENDAR_NON_SETTLEMENT_DATES']

    def __init__(self):
        super(DataVendorBBG, self).__init__()
        self.logger = LoggerManager().getLogger(__name__)

    # Implement method in abstract superclass
    def load_ticker(self, market_data_request):
        """Retrieves market data from an external data source (in this case
        Bloomberg)

        Parameters
        ----------
        market_data_request : MarketDataRequest
            contains all the various parameters detailing time series start
            and finish, tickers etc

        Returns
        -------
        DataFrame
        """
        market_data_request = MarketDataRequest(
            md_request=market_data_request)
        market_data_request_vendor = \
            self.construct_vendor_market_data_request(market_data_request)

        data_frame = None

        self.logger.info("Request Bloomberg data")

        # Do we need daily or intraday data?
        if market_data_request.freq in ['daily', 'weekly', 'monthly',
                                        'quarterly', 'yearly']:

            # Work out the fields which need to be downloaded via a Bloomberg
            # reference request (BDP) and those that can be downloaded via a
            # historical request (BDH)
            ref_fields = []
            ref_vendor_fields = []

            for i in range(0, len(market_data_request.fields)):
                if market_data_request.fields[i] in self.list_of_ref_fields \
                        or market_data_request_vendor.fields[i] in \
                        self.list_of_ref_vendor_fields:
                    ref_fields.append(market_data_request.fields[i])
                    ref_vendor_fields.append(
                        market_data_request_vendor.fields[i])

            non_ref_fields = []
            non_ref_vendor_fields = []

            for i in range(0, len(market_data_request.fields)):
                if market_data_request.fields[i] not in \
                        self.list_of_ref_fields \
                        and market_data_request_vendor.fields[i] not in \
                        self.list_of_ref_vendor_fields:
                    non_ref_fields.append(market_data_request.fields[i])
                    non_ref_vendor_fields.append(
                        market_data_request_vendor.fields[i])

            # For certain cases, we need to use a ReferenceDataRequest,
            # eg. for event times/dates, last tradeable date fields
            # (when specified)
            if len(ref_fields) > 0:
                # Careful: make sure you copy the market data request object
                # (when threading, altering it can cause concurrency issues!)
                old_fields = copy.deepcopy(market_data_request.fields)
                old_vendor_fields = copy.deepcopy(
                    market_data_request_vendor.fields)

                market_data_request.fields = ref_fields
                market_data_request.vendor_fields = ref_vendor_fields
                market_data_request_vendor = \
                    self.construct_vendor_market_data_request(
                        market_data_request)

                # Just select those reference fields to download via a
                # reference request
                datetime_data_frame = self.get_reference_data(
                    market_data_request_vendor, market_data_request)

                # Download all the other non-ref fields (uses
                # HistoricalDataRequest to Bloomberg) and concatenate with
                # the date/time fields
                if len(non_ref_fields) > 0:
                    market_data_request.fields = non_ref_fields
                    market_data_request.vendor_fields = non_ref_vendor_fields
                    market_data_request_vendor = \
                        self.construct_vendor_market_data_request(
                            market_data_request)

                    events_data_frame = self.get_daily_data(
                        market_data_request, market_data_request_vendor)

                    col = events_data_frame.index.name
                    events_data_frame = events_data_frame.reset_index(
                        drop=False)

                    data_frame = pandas.concat(
                        [events_data_frame, datetime_data_frame], axis=1)
                    temp = data_frame[col]
                    del data_frame[col]
                    data_frame.index = temp
                else:
                    data_frame = datetime_data_frame

                market_data_request.fields = copy.deepcopy(old_fields)
                market_data_request_vendor.fields = copy.deepcopy(
                    old_vendor_fields)

            # For all other daily/monthly/quarterly data, we can use
            # HistoricalDataRequest to Bloomberg
            else:
                data_frame = self.get_daily_data(market_data_request,
                                                 market_data_request_vendor)

                try:
                    # Convert fields with release-dt to dates (special case!)
                    for c in data_frame.columns:
                        if 'release-dt' in c:
                            data_frame[c] = data_frame[c].astype(
                                'int').astype(str).apply(
                                lambda x: pandas.to_datetime(
                                    x, format='%Y%m%d'))
                except:
                    pass

        # Assume one ticker only for intraday data and use
        # IntradayDataRequest to Bloomberg
        if market_data_request.freq in ['tick', 'intraday', 'second',
                                        'minute', 'hourly']:
            market_data_request_vendor.tickers = \
                market_data_request_vendor.tickers[0]

            if market_data_request.freq in ['tick', 'second']:
                data_frame = self.download_tick(market_data_request_vendor)
            else:
                data_frame = self.download_intraday(
                    market_data_request_vendor)

            if data_frame is not None:
                if data_frame.empty:
                    try:
                        self.logger.info(
                            "No tickers returned for: "
                            + market_data_request_vendor.tickers)
                    except:
                        pass

                    return None

                cols = data_frame.columns.values

                import pytz

                try:
                    data_frame = data_frame.tz_localize(pytz.utc)
                except:
                    data_frame = data_frame.tz_convert(pytz.utc)

                cols = market_data_request.tickers[0] + "." + cols
                data_frame.columns = cols

        self.logger.info("Completed request from Bloomberg.")

        return data_frame

    def get_daily_data(self, market_data_request,
                       market_data_request_vendor):
        data_frame = self.download_daily(market_data_request_vendor)

        # Convert from vendor to findatapy tickers/fields
        if data_frame is not None:
            if data_frame.empty:
                self.logger.info("No tickers returned for...")

                try:
                    self.logger.info(
                        str(market_data_request_vendor.tickers))
                except:
                    pass

                return None

            returned_fields = data_frame.columns.get_level_values(0)
            returned_tickers = data_frame.columns.get_level_values(1)

            # TODO if empty try downloading again a year later
            try:
                fields = self.translate_from_vendor_field(
                    returned_fields, market_data_request)
            except Exception as e:
                self.logger.error("Problem translating vendor fields: "
                                  + str(e))

            tickers = self.translate_from_vendor_ticker(
                returned_tickers, market_data_request)

            ticker_combined = []

            for i in range(0, len(fields)):
                ticker_combined.append(tickers[i] + "." + fields[i])

            data_frame.columns = ticker_combined
            data_frame.index.name = 'Date'

        return data_frame

    def get_reference_data(self, market_data_request_vendor,
                           market_data_request):
        end = datetime.utcnow()

        from datetime import timedelta

        # Very often we may wish to download data about future calendar
        # events, so push the finish date a year into the future
        end = end + timedelta(days=365)

        market_data_request_vendor.finish_date = end

        self.logger.debug("Requesting ref for "
                          + market_data_request_vendor.tickers[0] + " etc.")

        data_frame = self.download_ref(market_data_request_vendor)

        self.logger.debug("Waiting for ref...")

        # Convert from vendor to findatapy tickers/fields
        if data_frame is not None:
            if data_frame.empty:
                return None

            returned_fields = data_frame.columns.get_level_values(0)
            returned_tickers = data_frame.columns.get_level_values(1)

        if data_frame is not None:
            # TODO if empty try downloading again a year later
            fields = self.translate_from_vendor_field(returned_fields,
                                                      market_data_request)
            tickers = self.translate_from_vendor_ticker(
                returned_tickers, market_data_request)

            ticker_combined = []

            for i in range(0, len(fields)):
                ticker_combined.append(tickers[i] + "." + fields[i])

            data_frame.columns = ticker_combined

            # Need to convert numerical and datetime columns separately post
            # pandas 0.23
            data_frame = data_frame.apply(pandas.to_numeric, errors='ignore')
            data_frame = data_frame.apply(pandas.to_datetime,
                                          errors='ignore')

        return data_frame

    # Implement methods in abstract superclass
    @abc.abstractmethod
    def kill_session(self):
        return

    @abc.abstractmethod
    def download_tick(self, market_data_request):
        return

    @abc.abstractmethod
    def download_intraday(self, market_data_request):
        return

    @abc.abstractmethod
    def download_daily(self, market_data_request):
        return

    @abc.abstractmethod
    def download_ref(self, market_data_request):
        return
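# An illustrative (hypothetical) stub showing the minimal surface a concrete
# Bloomberg implementation has to provide. The real implementation mentioned
# in the class docstring is DataVendorBBGOpen, which fills these in using the
# blpapi-based Open API; this stub exists purely to show the contract.
class _ExampleDataVendorBBGStub(DataVendorBBG):
    def kill_session(self):
        pass  # close any open blpapi session here

    def download_tick(self, market_data_request):
        return None  # return a DataFrame of tick data

    def download_intraday(self, market_data_request):
        return None  # return a DataFrame of intraday bars

    def download_daily(self, market_data_request):
        return None  # return a DataFrame with (field, ticker) column levels

    def download_ref(self, market_data_request):
        return None  # return a DataFrame of reference/BDS style fields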
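# A small sketch (hypothetical field names) of how load_ticker above splits a
# request's fields between Bloomberg reference downloads (BDS/BDP style) and
# ordinary historical downloads (BDH style), using list_of_ref_fields.
def _example_split_ref_and_non_ref_fields():
    requested_fields = ['close', 'last-tradeable-day', 'open']

    ref_fields = [f for f in requested_fields
                  if f in DataVendorBBG.list_of_ref_fields]
    non_ref_fields = [f for f in requested_fields
                      if f not in DataVendorBBG.list_of_ref_fields]

    # ref_fields -> downloaded via get_reference_data (ReferenceDataRequest)
    # non_ref_fields -> downloaded via get_daily_data (HistoricalDataRequest)
    return ref_fields, non_ref_fields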
class IOEngine(object):
    """Writes and reads time series data to/from disk in various formats
    (CSV, HDF5 fixed and table formats, Parquet) and to MongoDB/Arctic.

    Can be used to save down output of finmarketpy backtests and also to
    cache market data locally.

    Also supports bcolz (but not currently stable). Planning to add other
    interfaces such as SQL etc.
    """

    def __init__(self):
        self.logger = LoggerManager().getLogger(__name__)

    ### functions to handle Excel on disk
    def write_time_series_to_excel(self, fname, sheet, data_frame,
                                   create_new=False):
        """Writes a Pandas DataFrame to disk in Excel format

        Parameters
        ----------
        fname : str
            Excel filename to be written to
        sheet : str
            sheet in Excel
        data_frame : DataFrame
            data frame to be written
        create_new : boolean
            to create a new Excel file
        """
        if create_new:
            writer = pandas.ExcelWriter(fname, engine='xlsxwriter')
        else:
            if os.path.isfile(fname):
                book = load_workbook(fname)
                writer = pandas.ExcelWriter(fname, engine='xlsxwriter')
                writer.book = book
                writer.sheets = dict((ws.title, ws)
                                     for ws in book.worksheets)
            else:
                writer = pandas.ExcelWriter(fname, engine='xlsxwriter')

        data_frame.to_excel(writer, sheet_name=sheet, engine='xlsxwriter')

        writer.save()
        writer.close()

    def write_time_series_to_excel_writer(self, writer, sheet, data_frame):
        """Writes a Pandas DataFrame to disk in Excel format for a writer

        Parameters
        ----------
        writer : ExcelWriter
            File handle to use for writing Excel file to disk
        sheet : str
            sheet in Excel
        data_frame : DataFrame
            data frame to be written
        """
        data_frame.to_excel(writer, sheet, engine='xlsxwriter')

    def read_excel_data_frame(self, f_name, excel_sheet, freq, cutoff=None,
                              dateparse=None, postfix='.close',
                              intraday_tz='UTC'):
        """Reads Excel from disk into a DataFrame

        Parameters
        ----------
        f_name : str
            Excel file path to read
        freq : str
            Frequency of data to read (intraday/daily etc)
        cutoff : DateTime (optional)
            end date to read up to
        dateparse : str (optional)
            date parser to use
        postfix : str (optional)
            postfix to add to each column
        intraday_tz : str
            timezone of file if it uses intraday data

        Returns
        -------
        DataFrame
        """
        return self.read_csv_data_frame(f_name, freq, cutoff=cutoff,
                                        dateparse=dateparse, postfix=postfix,
                                        intraday_tz=intraday_tz,
                                        excel_sheet=excel_sheet)

    def remove_time_series_cache_on_disk(self, fname, engine='hdf5_fixed',
                                         db_server='127.0.0.1',
                                         db_port='6379', timeout=10,
                                         username=None, password=None):

        if 'hdf5' in engine:
            engine = 'hdf5'

        if engine == 'bcolz':
            # Convert invalid characters to substitutes
            # (which bcolz can't deal with)
            pass
        elif engine == 'redis':
            import redis

            fname = os.path.basename(fname).replace('.', '_')

            try:
                r = redis.StrictRedis(host=db_server, port=db_port, db=0,
                                      socket_timeout=timeout,
                                      socket_connect_timeout=timeout)

                if fname == 'flush_all_keys':
                    r.flushall()
                else:
                    # Allow deletion of keys by pattern matching
                    if "*" in fname:
                        x = r.keys(fname)

                        if len(x) > 0:
                            r.delete(*x)

                    r.delete(fname)

            except Exception as e:
                self.logger.warning("Cannot delete non-existent key " + fname
                                    + " in Redis: " + str(e))

        elif engine == 'arctic':
            from arctic import Arctic
            import pymongo

            socketTimeoutMS = 30 * 1000
            fname = os.path.basename(fname).replace('.', '_')

            self.logger.info('Load MongoDB library: ' + fname)

            if username is not None and password is not None:
                c = pymongo.MongoClient(
                    host="mongodb://" + username + ":" + password + "@"
                         + str(db_server) + ":" + str(db_port),
                    connect=False)
            else:
                c = pymongo.MongoClient(
                    host="mongodb://" + str(db_server) + ":" + str(db_port),
                    connect=False)

            store = Arctic(c, socketTimeoutMS=socketTimeoutMS,
                           serverSelectionTimeoutMS=socketTimeoutMS,
                           connectTimeoutMS=socketTimeoutMS)

            store.delete_library(fname)

            c.close()

            self.logger.info("Deleted MongoDB library: " + fname)

        elif engine == 'hdf5':
            h5_filename = self.get_h5_filename(fname)

            # Delete the old copy
            try:
                os.remove(h5_filename)
            except:
                pass

    ### functions to handle HDF5 on disk
    def write_time_series_cache_to_disk(self, fname, data_frame,
                                        engine='hdf5_fixed',
                                        append_data=False,
                                        db_server=DataConstants().db_server,
                                        db_port=DataConstants().db_port,
                                        username=None, password=None,
                                        filter_out_matching=None,
                                        timeout=10):
        """Writes a Pandas DataFrame to disk in HDF5, bcolz or Parquet
        format, or to a Redis or Arctic/MongoDB database

        Parameters
        ----------
        fname : str
            path of file
        data_frame : DataFrame
            data frame to be written to disk
        engine : str
            'hdf5_fixed' - use HDF5 fixed format, very quick, but cannot
                append to this
            'hdf5_table' - use HDF5 table format, slower but can append to
            'parquet' - use Parquet
            'arctic' - use Arctic/MongoDB database
            'redis' - use Redis
        append_data : bool
            False - write a fresh copy of data on disk each time
            True - append data to disk
        db_server : str
            Database server for Arctic (default: '127.0.0.1')
        timeout : int
            Number of seconds for the timeout
        """

        # Default HDF5 format
        hdf5_format = 'fixed'

        if 'hdf5' in engine:
            hdf5_format = engine.split('_')[1]
            engine = 'hdf5'

        if engine == 'bcolz':
            # Convert invalid characters to substitutes
            # (which bcolz can't deal with)
            data_frame.columns = self.find_replace_chars(
                data_frame.columns, _invalid_chars, _replace_chars)
            data_frame.columns = ['A_' + x for x in data_frame.columns]

            data_frame['DTS_'] = pandas.to_datetime(data_frame.index,
                                                    unit='ns')

            bcolzpath = self.get_bcolz_filename(fname)
            shutil.rmtree(bcolzpath, ignore_errors=True)
            zlens = bcolz.ctable.fromdataframe(data_frame, rootdir=bcolzpath)
        elif engine == 'redis':
            import redis

            fname = os.path.basename(fname).replace('.', '_')

            try:
                r = redis.StrictRedis(host=db_server, port=db_port, db=0,
                                      socket_timeout=timeout,
                                      socket_connect_timeout=timeout)

                if isinstance(data_frame, pandas.DataFrame):
                    r.set(fname, data_frame.to_msgpack(compress='blosc'))

                self.logger.debug("Pushed " + fname + " to Redis")
            except Exception as e:
                self.logger.warning("Couldn't push " + fname + " to Redis: "
                                    + str(e))

        elif engine == 'arctic':
            from arctic import Arctic
            import pymongo

            socketTimeoutMS = 30 * 1000
            fname = os.path.basename(fname).replace('.', '_')

            self.logger.info('Load Arctic/MongoDB library: ' + fname)

            if username is not None and password is not None:
                c = pymongo.MongoClient(
                    host="mongodb://" + username + ":" + password + "@"
                         + str(db_server) + ":" + str(db_port),
                    connect=False)
            else:
                c = pymongo.MongoClient(
                    host="mongodb://" + str(db_server) + ":" + str(db_port),
                    connect=False)

            store = Arctic(c, socketTimeoutMS=socketTimeoutMS,
                           serverSelectionTimeoutMS=socketTimeoutMS,
                           connectTimeoutMS=socketTimeoutMS)

            database = None

            try:
                database = store[fname]
            except:
                pass

            if database is None:
                store.initialize_library(fname, audit=False)
                self.logger.info("Created MongoDB library: " + fname)
            else:
                self.logger.info("Got MongoDB library: " + fname)

            # Access the library
            library = store[fname]

            if 'intraday' in fname:
                data_frame = data_frame.astype('float32')

            if filter_out_matching is not None:
                cols = data_frame.columns

                new_cols = []

                for col in cols:
                    if filter_out_matching not in col:
                        new_cols.append(col)

                data_frame = data_frame[new_cols]

            # Sometimes have problems with Arctic when writing a timezone to
            # disk, so strip it
            data_frame = data_frame.copy().tz_localize(None)

            # Can duplicate values if we have existing dates
            if append_data:
                library.append(fname, data_frame)
            else:
                library.write(fname, data_frame)

            c.close()

            self.logger.info("Written MongoDB library: " + fname)

        elif engine == 'hdf5':
            h5_filename = self.get_h5_filename(fname)

            # Appending data only works for HDF5 stored as tables (but this
            # is much slower than fixed format); removes duplicated entries
            # at the end
            if append_data:
                store = pandas.HDFStore(h5_filename, format=hdf5_format,
                                        complib="blosc", complevel=9)

                if 'intraday' in fname:
                    data_frame = data_frame.astype('float32')

                # Get the last row which matches and remove everything after
                # that (because the append function doesn't check for
                # duplicated rows)
                nrows = len(store['data'].index)
                last_point = data_frame.index[-1]

                i = nrows - 1

                while i > 0:
                    read_index = store.select('data', start=i,
                                              stop=nrows).index[0]

                    if read_index <= last_point:
                        break

                    i = i - 1

                # Remove rows at the end, which are duplicates of the
                # incoming time series
                store.remove(key='data', start=i, stop=nrows)
                store.put(key='data', value=data_frame, format=hdf5_format,
                          append=True)
                store.close()
            else:
                h5_filename_temp = self.get_h5_filename(fname + ".temp")

                # Delete the old copy
                try:
                    os.remove(h5_filename_temp)
                except:
                    pass

                store = pandas.HDFStore(h5_filename_temp, format=hdf5_format,
                                        complib="blosc", complevel=9)

                if 'intraday' in fname:
                    data_frame = data_frame.astype('float32')

                store.put(key='data', value=data_frame, format=hdf5_format)
                store.close()

                # Delete the old copy
                try:
                    os.remove(h5_filename)
                except:
                    pass

                # Once written to disk rename
                os.rename(h5_filename_temp, h5_filename)

            self.logger.info("Written HDF5: " + fname)

        elif engine == 'parquet':
            if fname[-5:] != '.gzip':
                fname = fname + '.gzip'

            data_frame.to_parquet(fname, compression='gzip')

            self.logger.info("Written Parquet: " + fname)

    def get_h5_filename(self, fname):
        """Adds a .h5 extension to the filename if it is not already there

        Parameters
        ----------
        fname : str
            filename

        Returns
        -------
        str
        """
        if fname[-3:] == '.h5':
            return fname

        return fname + ".h5"

    def get_bcolz_filename(self, fname):
        """Adds a .bcolz extension to the filename if it is not already there

        Parameters
        ----------
        fname : str
            filename

        Returns
        -------
        str
        """
        if fname[-6:] == '.bcolz':
            return fname

        return fname + ".bcolz"
    def write_r_compatible_hdf_dataframe(self, data_frame, fname,
                                         fields=None):
        """Writes a DataFrame to disk as an R-compatible HDF5 file.

        Parameters
        ----------
        data_frame : DataFrame
            data frame to be written
        fname : str
            file path to be written
        fields : list(str)
            columns to be written
        """
        fname_r = self.get_h5_filename(fname)

        self.logger.info("About to dump R binary HDF5 - " + fname_r)

        data_frame32 = data_frame.astype('float32')

        if fields is None:
            fields = data_frame32.columns.values

        # Decompose date/time into individual fields (easier to pick up in R)
        data_frame32['Year'] = data_frame.index.year
        data_frame32['Month'] = data_frame.index.month
        data_frame32['Day'] = data_frame.index.day
        data_frame32['Hour'] = data_frame.index.hour
        data_frame32['Minute'] = data_frame.index.minute
        data_frame32['Second'] = data_frame.index.second
        data_frame32['Millisecond'] = data_frame.index.microsecond / 1000

        data_frame32 = data_frame32[
            ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Second',
             'Millisecond'] + fields]

        cols = data_frame32.columns

        store_export = pandas.HDFStore(fname_r)
        store_export.put('df_for_r', data_frame32, data_columns=cols)
        store_export.close()

    def read_time_series_cache_from_disk(self, fname, engine='hdf5',
                                         start_date=None, finish_date=None,
                                         db_server=DataConstants().db_server,
                                         db_port=DataConstants().db_port,
                                         username=None, password=None):
        """Reads a time series cache from disk, in HDF5, bcolz or Parquet
        format, or from a Redis or Arctic/MongoDB database

        Parameters
        ----------
        fname : str (or list)
            file to be read from
        engine : str (optional)
            'hdf5' - reads HDF5 files (default)
            'arctic' - reads from Arctic/MongoDB database
            'bcolz' - reads from bcolz file (not fully implemented)
        start_date : str/datetime (optional)
            Start date
        finish_date : str/datetime (optional)
            Finish date
        db_server : str
            IP address of MongoDB (default '127.0.0.1')

        Returns
        -------
        DataFrame
        """
        logger = LoggerManager().getLogger(__name__)

        data_frame_list = []

        if not isinstance(fname, list):
            if '*' in fname:
                fname = glob.glob(fname)
            else:
                fname = [fname]

        for fname_single in fname:
            logger.debug("Reading " + fname_single + "..")

            if engine == 'bcolz':
                try:
                    name = self.get_bcolz_filename(fname_single)
                    zlens = bcolz.open(rootdir=name)
                    data_frame = zlens.todataframe()

                    data_frame.index = pandas.DatetimeIndex(
                        data_frame['DTS_'])
                    data_frame.index.name = 'Date'
                    del data_frame['DTS_']

                    # Convert invalid characters (which bcolz can't deal
                    # with) back to more readable characters for pandas
                    data_frame.columns = self.find_replace_chars(
                        data_frame.columns, _replace_chars, _invalid_chars)
                    data_frame.columns = [x[2:] for x in data_frame.columns]
                except:
                    data_frame = None

            elif engine == 'redis':
                import redis

                fname_single = os.path.basename(fname_single).replace(
                    '.', '_')

                msg = None

                try:
                    r = redis.StrictRedis(host=db_server, port=db_port, db=0)
                    msg = r.get(fname_single)
                except:
                    self.logger.info("Cache not existent for " + fname_single
                                     + " in Redis")

                if msg is None:
                    data_frame = None
                else:
                    self.logger.info('Load Redis cache: ' + fname_single)

                    data_frame = pandas.read_msgpack(msg)

            elif engine == 'arctic':
                socketTimeoutMS = 2 * 1000

                import pymongo
                from arctic import Arctic

                fname_single = os.path.basename(fname_single).replace(
                    '.', '_')

                self.logger.info('Load Arctic/MongoDB library: '
                                 + fname_single)

                if username is not None and password is not None:
                    c = pymongo.MongoClient(
                        host="mongodb://" + username + ":" + password + "@"
                             + str(db_server) + ":" + str(db_port),
                        connect=False)
                else:
                    c = pymongo.MongoClient(
                        host="mongodb://" + str(db_server) + ":"
                             + str(db_port),
                        connect=False)

                store = Arctic(c, socketTimeoutMS=socketTimeoutMS,
                               serverSelectionTimeoutMS=socketTimeoutMS)

                # Access the library
                try:
                    library = store[fname_single]

                    if start_date is None and finish_date is None:
                        item = library.read(fname_single)
                    else:
                        from arctic.date import DateRange
                        item = library.read(
                            fname_single,
                            date_range=DateRange(
                                start_date.replace(tzinfo=None),
                                finish_date.replace(tzinfo=None)))

                    c.close()

                    self.logger.info('Read ' + fname_single)

                    data_frame = item.data
                except Exception as e:
                    self.logger.warning(
                        'Library may not exist or another error: '
                        + fname_single + ' & message is ' + str(e))
                    data_frame = None

            elif os.path.isfile(self.get_h5_filename(fname_single)):
                store = pandas.HDFStore(self.get_h5_filename(fname_single))
                data_frame = store.select("data")

                if 'intraday' in fname_single:
                    data_frame = data_frame.astype('float32')

                store.close()

            elif os.path.isfile(fname_single):
                data_frame = pandas.read_parquet(fname_single)

            data_frame_list.append(data_frame)

        if len(data_frame_list) == 1:
            return data_frame_list[0]

        return data_frame_list

    ### functions for CSV reading and writing
    def write_time_series_to_csv(self, csv_path, data_frame):
        data_frame.to_csv(csv_path)

    def read_csv_data_frame(self, f_name, freq, cutoff=None, dateparse=None,
                            postfix='.close', intraday_tz='UTC',
                            excel_sheet=None):
        """Reads CSV/Excel from disk into a DataFrame

        Parameters
        ----------
        f_name : str
            CSV/Excel file path to read
        freq : str
            Frequency of data to read (intraday/daily etc)
        cutoff : DateTime (optional)
            end date to read up to
        dateparse : str (optional)
            date parser to use
        postfix : str (optional)
            postfix to add to each column
        intraday_tz : str (optional)
            timezone of file if it uses intraday data
        excel_sheet : str (optional)
            Excel sheet to be read

        Returns
        -------
        DataFrame
        """
        if freq == 'intraday':

            if dateparse is None:
                dateparse = lambda x: datetime.datetime(*map(
                    int, [x[6:10], x[3:5], x[0:2], x[11:13], x[14:16],
                          x[17:19]]))
            elif dateparse == 'dukascopy':
                dateparse = lambda x: datetime.datetime(*map(
                    int, [x[0:4], x[5:7], x[8:10], x[11:13], x[14:16],
                          x[17:19]]))
            elif dateparse == 'c':
                # Use C library for parsing dates, several hundred times
                # quicker; requires compilation of the library to install
                import ciso8601
                dateparse = lambda x: ciso8601.parse_datetime(x)

            if excel_sheet is None:
                data_frame = pandas.read_csv(f_name, index_col=0,
                                             parse_dates=True,
                                             date_parser=dateparse)
            else:
                data_frame = pandas.read_excel(f_name, excel_sheet,
                                               index_col=0,
                                               na_values=['NA'])

            data_frame = data_frame.astype('float32')
            data_frame.index.names = ['Date']

            old_cols = data_frame.columns
            new_cols = []

            # Add '.close' to each column name
            for col in old_cols:
                new_cols.append(col + postfix)

            data_frame.columns = new_cols
        else:
            # Daily data
            if 'events' in f_name:
                data_frame = pandas.read_csv(f_name)

                # Very slow conversion
                data_frame = data_frame.convert_objects(
                    convert_dates='coerce')
            else:
                if excel_sheet is None:
                    try:
                        data_frame = pandas.read_csv(f_name, index_col=0,
                                                     parse_dates=["DATE"],
                                                     date_parser=dateparse)
                    except:
                        data_frame = pandas.read_csv(f_name, index_col=0,
                                                     parse_dates=["Date"],
                                                     date_parser=dateparse)
                else:
                    data_frame = pandas.read_excel(f_name, excel_sheet,
                                                   index_col=0,
                                                   na_values=['NA'])

        if freq == 'intraday':
            # Assume time series are already in UTC and assign this
            # (can specify other time zones)
            data_frame = data_frame.tz_localize(intraday_tz)

        # End cutoff date
        if cutoff is not None:
            if isinstance(cutoff, str):
                cutoff = parse(cutoff)

            data_frame = data_frame.loc[data_frame.index < cutoff]

        return data_frame

    def find_replace_chars(self, array, to_find, replace_with):

        for i in range(0, len(to_find)):
            array = [x.replace(to_find[i], replace_with[i]) for x in array]

        return array

    def convert_csv_data_frame(self, f_name, category, freq, cutoff=None,
                               dateparse=None):
        """Converts a CSV file to an HDF5 file

        Parameters
        ----------
        f_name : str
            File name to be read
        category : str
            data category of file (used in HDF5 filename)
        freq : str
            intraday/daily frequency (used in HDF5 filename)
        cutoff : DateTime (optional)
            filter dates up to here
        dateparse : str
            date parser to use
        """
        self.logger.info("About to read... " + f_name)

        data_frame = self.read_csv_data_frame(f_name, freq, cutoff=cutoff,
                                              dateparse=dateparse)

        category_f_name = self.create_cache_file_name(category)

        self.write_time_series_cache_to_disk(category_f_name, data_frame)

    def clean_csv_file(self, f_name):
        """Cleans up CSV file (removing empty characters) before writing
        back to disk

        Parameters
        ----------
        f_name : str
            CSV file to be cleaned
        """
        with codecs.open(f_name, 'rb', 'utf-8') as myfile:
            data = myfile.read()

            # Clean file first if dirty
            if data.count('\x00'):
                self.logger.info('Cleaning CSV...')

                with codecs.open(f_name + '.tmp', 'w', 'utf-8') as of:
                    of.write(data.replace('\x00', ''))

                shutil.move(f_name + '.tmp', f_name)

    def create_cache_file_name(self, filename):
        return DataConstants().folder_time_series_data + "/" + filename

    # TODO refactor IOEngine so that each database is implemented in a
    # subclass of DBEngine
    def get_engine(self, engine='hdf5_fixed'):
        pass
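# A minimal usage sketch (hypothetical file name, assuming write access to the
# working directory and a Parquet engine such as pyarrow installed): cache a
# small DataFrame with IOEngine and read it back. The other engines
# ('hdf5_fixed', 'arctic', 'redis') follow the same write/read pattern but
# need the relevant backend available.
def _example_io_engine_parquet_roundtrip():
    import pandas

    df = pandas.DataFrame(
        {'EURUSD.close': [1.10, 1.11, 1.12]},
        index=pandas.date_range('2020-01-01', periods=3, freq='D'))

    io = IOEngine()

    # write_time_series_cache_to_disk appends '.gzip' if it is missing
    io.write_time_series_cache_to_disk('eurusd_daily_cache.gzip', df,
                                       engine='parquet')

    return io.read_time_series_cache_from_disk('eurusd_daily_cache.gzip',
                                               engine='parquet')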