def test_multiprocessing_safety(mongo_host, library_name):
    # Create/initialize library at the parent process, then spawn children, and start them aligned in time
    total_processes = 64
    total_writes_per_child = 100

    register_get_auth_hook(my_auth_hook)
    global MY_ARCTIC
    MY_ARCTIC = Arctic(mongo_host=mongo_host)

    MY_ARCTIC.initialize_library(library_name, VERSION_STORE)
    assert isinstance(MY_ARCTIC.get_library(library_name), VersionStore)

    processes = [Process(target=f, args=(library_name, total_writes_per_child, True))
                 for _ in range(total_processes)]

    for p in processes:
        p.start()

    for p in processes:
        p.join()

    for p in processes:
        assert p.exitcode == 0

    assert isinstance(MY_ARCTIC.get_library(library_name), VersionStore)
def test_multiprocessing_safety_parent_children_race(mongo_host, library_name):
    # Create Arctic and directly fork/start the children (no waiting)
    total_iterations = 12
    total_processes = 6
    total_writes_per_child = 20

    global MY_ARCTIC

    for i in range(total_iterations):
        processes = list()

        MY_ARCTIC = Arctic(mongo_host=mongo_host)
        for j in range(total_processes):
            p = Process(target=f, args=(library_name, total_writes_per_child, False))
            p.start()  # start immediately; don't wait for all child processes to be created first
            processes.append(p)

        MY_ARCTIC.initialize_library(library_name, VERSION_STORE)  # this will unblock the spinning children

        for p in processes:
            p.join()

        for p in processes:
            assert p.exitcode == 0

        MY_ARCTIC.reset()

    assert isinstance(MY_ARCTIC.get_library(library_name), VersionStore)
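# Hypothetical sketch (not part of the source) of the child worker `f` that both tests
# above spawn. Assumptions: each child reuses the module-level MY_ARCTIC handle it
# inherits on fork, spins until the parent has created the library when it is told not
# to assert on it, and then performs `n_writes` writes under a per-process symbol name.
import os
import time

from arctic.exceptions import LibraryNotFoundException


def f(library_name, n_writes, assert_library_exists):
    if assert_library_exists:
        assert isinstance(MY_ARCTIC.get_library(library_name), VersionStore)
    while True:
        try:
            lib = MY_ARCTIC.get_library(library_name)
            break
        except LibraryNotFoundException:
            time.sleep(0.1)  # parent has not initialized the library yet
    for i in range(n_writes):
        lib.write('symbol_{}'.format(os.getpid()), [i])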
def _arctic_loader():
    host = Arctic(arctic_opts['host'])
    lib = host.get_library(arctic_opts['library'])
    read_kwargs = {}
    start, end = map(arctic_opts.get, ['start', 'end'])
    if start and end:
        read_kwargs['chunk_range'] = pd.date_range(start, end)
    data = lib.read(arctic_opts['node'], **read_kwargs)
    if isinstance(data, VersionedItem):
        data = data.data
    return data
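# Minimal usage sketch (an assumption, not from the source): _arctic_loader reads its
# configuration from a module-level arctic_opts dict, so calling it would look roughly
# like this. The host, library, and symbol names below are placeholders.
arctic_opts = {
    'host': 'localhost',
    'library': 'example.library',
    'node': 'EXAMPLE_SYMBOL',
    'start': '2021-01-01',
    'end': '2021-03-31',
}
df = _arctic_loader()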
def update_pool_cmd(ctx, pool_id, start, end, libname, mongo_uri):
    config = Config()
    ctx.obj["start"] = start
    ctx.obj["end"] = end
    # keep the raw strings on the context for dumping usage; parse them for querying
    start, end = datetime.datetime.strptime(start, "%Y%m%d"), \
        datetime.datetime.strptime(end, "%Y%m%d")

    pool = ctx.obj["pools"][pool_id]()
    logger.info(f"Processing pool: {pool.__class__.__name__.lower()}")
    symbols = pool.variables.keys()

    # Load what already exists from the arctic MongoDB library, then fill in what is missing
    data_ = {}
    try:
        store = Arctic(mongo_uri)
        logger.info(f"Connecting to MongoDB: URI={mongo_uri}.")
        # Needs improvement: could be made async, since every symbol may have a different length
        if store.library_exists(libname):
            lib = store.get_library(libname)
            start_ = start
            for sym in symbols:
                try:
                    d = lib.read(sym).data
                    data_[sym] = d
                    if d.index[-1] > start_:  # check whether the stored data already extends past the requested start
                        start_ = d.index[-1]
                except arctic.exceptions.NoDataFoundException as e:
                    logger.info(e)
            start = start_  # fetch from the new start so the last stored data point is overridden
    except pymongo.errors.ServerSelectionTimeoutError as e:
        click.echo(str(e))

    data = pool.get_batch(symbols, start, end)

    # merge the stored and freshly fetched data
    if data_:
        for sym, d in data.items():
            d = pd.merge(data_[sym], d)  # this will replace with the new data
            data[sym] = d

    # yield the dataset per variable
    for symb, var in pool.variables.items():
        logger.info(f"symbol: {symb}")
        yield var, data[symb]
def loader_func(**kwargs):
    try:
        from arctic import Arctic
        from arctic.store.versioned_item import VersionedItem
    except ImportError:
        raise ImportError('In order to use the arctic loader you must install arctic!')

    host = Arctic(kwargs.get('host'))
    lib = host.get_library(kwargs.get('library'))
    read_kwargs = {}
    start, end = (kwargs.get(p) for p in ['start', 'end'])
    if start and end:
        read_kwargs['chunk_range'] = pd.date_range(start, end)
    data = lib.read(kwargs.get('node'), **read_kwargs)
    if isinstance(data, VersionedItem):
        data = data.data
    return data
def poke(self, context):
    hook = MongoHook(self.mongo_conn_id, libname=self.libname)
    client = hook.get_conn()
    store = Arctic(client)
    self.log.info(f'Poking for {self.mongo_conn_id}, {self.libname}: {self.symbol}')
    try:
        if store.library_exists(self.libname):
            lib = store.get_library(self.libname)
            if lib.has_symbol(self.symbol):
                return self.python_call_back(self.meta, lib.read_meta(self.symbol).metadata)
    except OSError:
        return False
    return False
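# Hypothetical example (not from the source) of a callback the sensor above could be
# given as python_call_back: succeed only once the stored metadata reports coverage up
# to the expected end date. The 'end' key and the `expected` dict are assumptions made
# here for illustration.
def meta_covers_expected_range(expected, metadata):
    if not metadata or metadata.get('end') is None:
        return False
    return metadata['end'] >= expected['end']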
def _arctic_loader():
    try:
        from arctic import Arctic
        from arctic.store.versioned_item import VersionedItem
    except BaseException as ex:
        logger.exception('In order to use the arctic loader you must install ahl.core!')
        raise ex

    host = Arctic(arctic_opts['host'])
    lib = host.get_library(arctic_opts['library'])
    read_kwargs = {}
    start, end = map(arctic_opts.get, ['start', 'end'])
    if start and end:
        read_kwargs['chunk_range'] = pd.date_range(start, end)
    data = lib.read(arctic_opts['node'], **read_kwargs)
    if isinstance(data, VersionedItem):
        data = data.data
    return data
async def update_basekets(model_id, start, end):
    from akira.akira_models.basket.utils import get_model

    # updating model
    store = Arctic(os.environ.get("MONGODB_URI", "localhost:27017"))
    lib = store.get_library("akira.tickers")
    spec = data_spec[model_id]
    cols = {}
    for symbol in spec["symbols"]:
        cols[symbol] = lib.read(symbol, date_range=DateRange(start=start, end=end))
    data = pd.concat(cols, axis=1)

    model_cls_spec = model_spec[model_id]
    model = get_model(model_id)(**model_cls_spec)
    model.fit(data)
    # submit trades
    return model
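# Minimal usage sketch (an assumption, not from the source): update_basekets is a
# coroutine, so it has to be driven by an event loop, e.g. via asyncio.run. The model
# id and date range below are placeholders.
import asyncio
import datetime

model = asyncio.run(
    update_basekets("example_model",
                    start=datetime.datetime(2021, 1, 1),
                    end=datetime.datetime(2021, 6, 30)))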
class Data(object):
    def __init__(self, dbname, offline=False):
        self.libraries = {}
        self.distributor = Distributer.default()

        if offline:
            self.db = OfflineDB()
            log.critical('WARNING Running in offline mode')
            return

        self.db = Arctic(dbname)

        # initialize databases
        for field in FIELDS:
            try:
                self.libraries[field] = _getLib(self.db, field)
            except (arctic.exceptions.LibraryNotFoundException, ServerSelectionTimeoutError):
                log.critical('Arctic not available, is mongo offline??')
                raise

    def cache(self, symbols=None, fields=None, delete=False):
        fields = fields or FIELDS
        symbols = symbols or p.symbolsDF().index.values.tolist()

        to_delete, to_fill, to_update = self.initialize(symbols, fields)

        if delete:
            # prune data
            self.delete(to_delete)

        self.backfill(to_fill)
        self.update(to_update)
        self.validate()

    def delete(self, to_delete=None):
        # delete data no longer needed
        for field in to_delete:
            for symbol in to_delete[field]:
                log.critical('Deleting %s from %s' % (symbol, field))
                self.libraries[field].delete(symbol)

    def backfill(self, to_fill):
        # backfill data if necessary
        for field in to_fill:
            log.critical('Backfilling %d items' % len(to_fill[field]))
            lib = self.libraries[field]
            for symbol, data in whichBackfill(field)(self.distributor, to_fill[field]):
                log.critical('Filling %s for %s' % (symbol, field))
                data_orig = lib.read(symbol).data
                _appendIfNecessary(lib, symbol, data_orig, data)

    def update(self, to_update):
        # update data if necessary
        for field in to_update:
            log.critical('Updating %d items' % len(to_update[field]))
            lib = self.libraries[field]
            for symbol, data in whichFetch(field)(self.distributor, to_update[field]):
                log.critical('Updating %s for %s' % (symbol, field))
                data_orig = self.libraries[field].read(symbol).data
                _appendIfNecessary(lib, symbol, data_orig, data)

    def initialize(self, symbols=None, fields=None):
        '''setup db'''
        fields = fields or FIELDS
        symbols = symbols or p.symbolsDF().index.values.tolist()

        to_fill = {}
        to_update = {}
        to_delete = {}
        _empty = pd.DataFrame()

        # initialize database and collect what to update
        for field in FIELDS:
            if field not in to_fill:
                to_fill[field] = []
            if field not in to_update:
                to_update[field] = []
            if field not in to_delete:
                to_delete[field] = []

            library = self.libraries[field]
            all_symbols = library.list_symbols()

            for symbol in symbols:
                symbol = symbol.upper()
                if symbol not in all_symbols:
                    log.critical('Initializing %s for %s' % (symbol, field))
                    to_fill[field].append(symbol)
                    library.write(symbol, _empty, metadata={'timestamp': never()})
                else:
                    metadata = library.read_metadata(symbol.upper()).metadata
                    if not metadata or not metadata.get('timestamp'):
                        to_fill[field].append(symbol)
                    elif metadata.get('timestamp', never()) <= never():
                        to_fill[field].append(symbol)
                    elif metadata.get('timestamp', never()) < _updateTime(field):
                        to_update[field].append(symbol)

            for symbol in set(all_symbols) - set(symbols):
                to_delete[field].append(symbol)

        return to_delete, to_fill, to_update

    def validate(self, symbols=None, fields=None):
        '''look for missing data'''
        fields = fields or FIELDS
        symbols = symbols or p.symbolsDF().index.values.tolist()
        to_refill = {}

        self.initialize(symbols, fields)

        for field in FIELDS:
            tick_start_date = today()
            daily_start_date = today()
            fail_count = 0
            print_fail = False
            dates = business_days(last_month(), yesterday())
            to_refill[field] = []

            if _skip(field):
                continue

            dbs = self.db.list_libraries()
            if field not in dbs:
                log.critical('VALIDATION FAILED %s' % field)
                continue

            lib = self.db.get_library(field)
            all_symbols = lib.list_symbols()

            for symbol in symbols:
                symbol = symbol.upper()

                # if fail count too high, autofail all for speed
                if fail_count > .2 * len(all_symbols):
                    if not print_fail:
                        log.critical('VALIDATION THRESHOLD REACHED for %s' % field)
                        print_fail = True
                    if _skip(field, symbol):
                        continue
                    to_refill[field].append(symbol)
                    if field == 'DAILY':
                        daily_start_date = dates[0]
                    if field == 'TICK':
                        tick_start_date = dates[0]
                    continue

                if _skip(field, symbol):
                    continue

                if symbol not in all_symbols:
                    to_refill[field].append(symbol)
                    log.critical('VALIDATION FAILED %s for %s' % (symbol, field))
                    fail_count += 1
                    continue

                data = lib.read(symbol).data
                if data.empty:
                    log.critical('VALIDATION FAILED - DATA EMPTY %s for %s' % (symbol, field))
                    to_refill[field].append(symbol)
                    fail_count += 1
                    continue
                elif field in ('TICK',):
                    for date in dates:
                        if date not in data.index:
                            log.critical('VALIDATION FAILED - DATA MISSING %s for %s : %s' % (symbol, field, date.strftime('%Y%m%d')))
                            to_refill[field].append(symbol)
                            tick_start_date = min(tick_start_date, date) if tick_start_date is not None else date
                            fail_count += 1
                            break
                elif field in ('DAILY',):
                    for date in dates:
                        if date not in data.index:
                            log.critical('VALIDATION FAILED - DATA MISSING %s for %s : %s' % (symbol, field, date.strftime('%Y%m%d')))
                            to_refill[field].append(symbol)
                            daily_start_date = min(daily_start_date, date) if daily_start_date is not None else date
                            fail_count += 1
                            break

        # backfill data if necessary
        for field in to_refill:
            lib = self.libraries[field]
            if field == 'TICK':
                log.critical('Backfilling %d items for %s - %s' % (len(to_refill[field]), field, str(tick_start_date)))
            elif field == 'DAILY':
                log.critical('Backfilling %d items for %s - %s' % (len(to_refill[field]), field, str(daily_start_date)))
            else:
                log.critical('Backfilling %d items for %s' % (len(to_refill[field]), field))

            for symbol, data in whichBackfill(field)(self.distributor, to_refill[field], from_=tick_start_date):
                log.critical('Updating %s for %s' % (symbol, field))
                data_orig = lib.read(symbol).data
                _appendIfNecessary(lib, symbol, data_orig, data)

    def read(self, symbol, field, fetch=True, fill=False):
        field = field.upper()
        symbol = symbol.upper()

        if field in ('QUOTE',):
            # dont cache, instantaneous
            return p.quoteDF(symbol)
        elif field in ('COMPOSITION',):
            return refetch(field, symbol)

        if field not in self.libraries and not fetch:
            return pd.DataFrame()

        l = _getLib(self.db, field)

        if not l.has_symbol(symbol):
            if not fetch:
                return pd.DataFrame()
            df = pd.DataFrame()
        else:
            df = l.read(symbol).data
            metadata = l.read_metadata(symbol).metadata

        if fetch:
            if df.empty or not metadata or not metadata.get('timestamp') or \
                    metadata.get('timestamp', never()) <= never() or \
                    metadata.get('timestamp', never()) < _updateTime(field):
                df = refetch(field, symbol)
                if fill:
                    l.write(symbol, df, metadata={'timestamp': datetime.now()})

        return df
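# Hedged usage sketch (not from the source): how the Data cache above might be driven.
# The database name, symbols, and field are placeholders and depend on the FIELDS
# configured elsewhere in the project.
data = Data('cache_db')                                  # connect to the Arctic-backed cache
data.cache(symbols=['AAPL', 'MSFT'], fields=['DAILY'])   # backfill, update, then validate
df = data.read('AAPL', 'DAILY')                          # read cached data, refetching if stale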