class ArcticBinary:
    def __init__(self, lib_name: str = _ARCTIC_BINARY_LIBRARY, mongo_db: str = "auto"):
        """Assumes each instance operates on exactly one library.

        mongo_db :
            "auto"     pick the Google or local Mongo automatically, depending
                       on whether the environment is colab
            "google"   use the Mongo hosted on Google
            "intranet" use the Mongo in the server room
        """
        # For now, the Mongo address used by arctic is hardcoded here.
        mongo_db_conn_str = get_mongo_admin_conn_str()
        if mongo_db == "google":
            mongo_db_conn_str = get_google_mongo_conn_str()
        elif mongo_db == "intranet":
            mongo_db_conn_str = get_intranet_mongo_conn_str()
        self._store = Arctic(mongo_db_conn_str)
        if not self._store.library_exists(lib_name):
            self._store.initialize_library(lib_name, VERSION_STORE)
        self._lib = self._store[lib_name]

    def write_bin_object(self, bin_data: bytes, symbol: str):
        self._lib.write(symbol, bin_data)

    def read_bin_object(self, symbol: str) -> bytes:
        return self._lib.read(symbol).data

    def has_symbol(self, symbol: str) -> bool:
        return self._lib.has_symbol(symbol)
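# --- Usage sketch (not from the original source): round-trip a pickled object
# through ArcticBinary. The library name "demo_binary" and the payload are
# hypothetical; a reachable Mongo instance is assumed. A VERSION_STORE library
# pickles arbitrary payloads, so bytes round-trip unchanged.
import pickle

def _demo_arctic_binary():
    store = ArcticBinary(lib_name="demo_binary")
    payload = pickle.dumps({"model": "xgb", "version": 3})
    store.write_bin_object(payload, symbol="model_blob")
    assert store.has_symbol("model_blob")
    restored = pickle.loads(store.read_bin_object("model_blob"))
    print(restored)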
def _get_lib(lib_name: str = "default", lib_type: str = VERSION_STORE):
    client = MongoClient(host=config.MONGO_HOST,
                         port=27017,
                         username=config.MONGO_USER,
                         password=config.MONGO_PWD,
                         authSource=config.MONGO_AUTHDB)
    a = Arctic(client)
    if not a.library_exists(lib_name):
        a.initialize_library(lib_name, lib_type=lib_type)
    return a[lib_name]
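# --- Usage sketch (illustrative, assumes valid Mongo credentials in `config`):
# a VersionStore keeps every write as a new version, so older snapshots stay
# readable via `as_of`. The library and symbol names below are hypothetical.
import pandas as pd

def _demo_get_lib():
    df = pd.DataFrame({"close": [1.0, 2.0]},
                      index=pd.date_range("2021-01-01", periods=2))
    lib = _get_lib("prices")
    lib.write("AAPL", df)
    item = lib.read("AAPL")  # latest version; pass as_of=... for an older one
    print(item.version, item.data.shape)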
def update_pool_cmd(ctx, pool_id, start, end, libname, mongo_uri):
    config = Config()
    ctx.obj["start"] = start
    ctx.obj["end"] = end
    start, end = datetime.datetime.strptime(start, "%Y%m%d"), \
        datetime.datetime.strptime(end, "%Y%m%d")  # raw strings stay in ctx.obj for dumping usage
    pool = ctx.obj["pools"][pool_id]()
    logger.info(f"Processing pool: {pool.__class__.__name__.lower()}")
    symbols = pool.variables.keys()
    # Load what we can from the arctic mongodb first, then fill in the missing range.
    data_ = {}
    try:
        store = Arctic(mongo_uri)
        logger.info(f"Connecting to mongodb: URI={mongo_uri}.")
        # TODO: improve this (e.g. fetch asynchronously), since every symbol
        # may hold data of a different length.
        if store.library_exists(libname):
            lib = store.get_library(libname)
            start_ = start
            for sym in symbols:
                try:
                    d = lib.read(sym).data
                    data_[sym] = d
                    if d.index[-1] > start_:  # db end date is past the requested start
                        start_ = d.index[-1]
                except arctic.exceptions.NoDataFoundException as e:
                    logger.info(e)
            start = start_  # update from here, so the start-date data point is overridden
    except pymongo.errors.ServerSelectionTimeoutError as e:
        click.echo(str(e))
    data = pool.get_batch(symbols, start, end)
    # Merge by replacing: prefer newly fetched values, fall back to stored ones.
    if data_:
        for sym, d in data.items():
            if sym in data_:
                data[sym] = d.combine_first(data_[sym])
    # dataset
    for symb, var in pool.variables.items():
        logger.info(f"symbol: {symb}")
        yield var, data[symb]
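# --- Why combine_first (a sketch, not from the original source, which called
# pd.merge): it keeps the newly fetched values wherever the two indexes
# overlap and falls back to the stored frame elsewhere, which matches the
# "replace with new data" intent of the merge step above.
import pandas as pd

old = pd.DataFrame({"v": [1, 2]}, index=pd.to_datetime(["2021-01-01", "2021-01-02"]))
new = pd.DataFrame({"v": [9, 3]}, index=pd.to_datetime(["2021-01-02", "2021-01-03"]))
merged = new.combine_first(old)  # 2021-01-02 keeps the new value 9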
def poke(self, context):
    hook = MongoHook(self.mongo_conn_id, libname=self.libname)
    client = hook.get_conn()
    store = Arctic(client)
    self.log.info(
        f'Poking for {self.mongo_conn_id}, {self.libname}: {self.symbol}')
    try:
        if store.library_exists(self.libname):
            lib = store.get_library(self.libname)
            if lib.has_symbol(self.symbol):
                return self.python_call_back(
                    self.meta, lib.read_metadata(self.symbol).metadata)
    except OSError:
        return False
    return False
class DB():
    def __init__(
            self,
            mongod=r'E:\mongodb-win32-x86_64-2008plus-ssl-4.0.6\bin\mongod.exe',
            dbPath=r'E:\mongodb-win32-x86_64-2008plus-ssl-4.0.6\bin\data',
            address='127.0.0.1'):
        self.mongod = mongod
        self.dbPath = dbPath
        self.address = address

    def startDB(self, storename='fx'):
        # Spawn a local mongod, then point Arctic at it.
        self.con = subprocess.Popen("%s %s %s" % (self.mongod, "--dbpath", self.dbPath),
                                    shell=True)
        self.store = Arctic(self.address)
        if not self.store.library_exists(storename):  # was hardcoded to 'fx'
            self.store.initialize_library(storename)
        self.library = self.store[storename]
        self.rc = self.con.returncode  # None while mongod is still running

    def readFxData(self, name='EURUSD', version=1, start='2016-07-01', end='2016-07-02'):
        return self.library.read(name, as_of=version,
                                 date_range=DateRange(start, end))

    def writeData2DB(self, df, name):
        self.library.write(name, df)

    def __del__(self):
        print("terminating db connection.")
        self.con.terminate()
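# --- Usage sketch (hypothetical; assumes the mongod binary and dbPath above
# exist). startDB spawns mongod asynchronously, so the server may need a
# moment before Arctic can connect.
def _demo_db():
    db = DB()
    db.startDB(storename='fx')
    # db.writeData2DB(df, 'EURUSD')  # df: a DataFrame with a datetime index
    data = db.readFxData('EURUSD', version=1, start='2016-07-01', end='2016-07-02')
    print(data)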
class ArcticChunkStorage:
    """How data is stored in Arctic.

    TS-style data:
      - lib_name corresponds to one data-access interface
      - symbol corresponds to one stock
      - chunk_size is usually "M" (for daily data)

    CS-style data:
      - lib_name: consider one per data vendor, or several APIs sharing one lib_name
      - symbol: the name of the data API
      - chunk_size is "D", one chunk per period (normally only one period is
        read at a time, and the local disk also keeps a pkl cache)
    """
    META_KEY_MTIME = "mtime"
    META_KEY_MAX_T_INSTORE = "max_t"

    def __init__(self):
        self.arctic_store = Arctic(get_mongo_admin_conn_str(),
                                   connectTimeoutMS=600 * 1000,
                                   serverSelectionTimeoutMS=600 * 1000)
        # NOTE: arctic_store already caches library handles.

    def check_library(self, lib_name: str):
        if not self.arctic_store.library_exists(lib_name):
            self.arctic_store.initialize_library(lib_name, lib_type=CHUNK_STORE)

    def is_symbol_exist(self, lib_name: str, symbol: str) -> bool:
        if not self.arctic_store.library_exists(lib_name):
            return False
        lib_chunk_store = self.arctic_store[lib_name]
        return lib_chunk_store.has_symbol(symbol)

    def is_cs_date_exist(self, lib_name: str, api_name: str, date_v: date) -> bool:
        if not self.is_symbol_exist(lib_name, api_name):
            return False
        df = self.arctic_store[lib_name].read(api_name,
                                              chunk_range=pd.date_range(date_v, date_v))
        return _get_df_rows_count(df) > 0

    def init_write_chunk_lib(self, lib_name: str, chunk_size: str, symbol: str,
                             df: pd.DataFrame):
        if not self.arctic_store.library_exists(lib_name):
            self.arctic_store.initialize_library(lib_name, lib_type=CHUNK_STORE)
        lib_chunk_store = self.arctic_store[lib_name]
        if lib_chunk_store.has_symbol(symbol):
            lib_chunk_store.delete(symbol)
        run_start = time.time()
        # Only write a dataframe that has data; if the dataframe has no rows,
        # only the meta content is written.
        lib_chunk_store.write(symbol, df, chunk_size=chunk_size, upsert=True)
        logger.debug(
            f"Init write {lib_name}-{symbol} arctic, used {time.time() - run_start} secs, "
            f"{_get_df_rows_count(df)} rows")
        max_date_in_db = df.index.max()
        lib_chunk_store.write_metadata(
            symbol, {
                self.META_KEY_MTIME: datetime.now(),
                self.META_KEY_MAX_T_INSTORE: max_date_in_db
            })

    def _write_cs_chunk_lib(self, lib_name: str, api_name: str, df: pd.DataFrame):
        if not self.arctic_store.library_exists(lib_name):
            self.arctic_store.initialize_library(lib_name, lib_type=CHUNK_STORE)
        lib_chunk_store = self.arctic_store[lib_name]
        run_start = time.time()
        if not lib_chunk_store.has_symbol(api_name):
            lib_chunk_store.write(api_name, df, chunk_size="D", upsert=True)
        else:
            lib_chunk_store.update(api_name, df, upsert=True)
        logger.debug(
            f"Init write {lib_name}-{api_name} arctic, used {time.time() - run_start} secs, "
            f"{_get_df_rows_count(df)} rows")
        lib_chunk_store.write_metadata(api_name, {self.META_KEY_MTIME: datetime.now()})

    def _append_write_chunk_lib(self, lib_name: str, symbol: str, df: pd.DataFrame):
        lib_chunk_store = self.arctic_store[lib_name]
        min_date = df.index.min()
        # Read back the last chunk; it has to be updated together with the new rows.
        df_last_chunk = next(lib_chunk_store.reverse_iterator(symbol))
        max_date_in_db = df_last_chunk.index.max()
        if max_date_in_db >= min_date:
            err_msg = (f"Can't append data: {lib_name}-{symbol} already exists in db. "
                       f"max_db_t:{max_date_in_db}, min_data_t:{min_date}. "
                       f"Maybe another process has updated the ts data!")
            logger.error(err_msg)
            # NOTE: no exception is raised here any more; several processes may
            # be updating the same arctic data at the same time, in which case
            # the data has simply already been updated.
            # raise RuntimeError(err_msg)
            # Still update the meta, so this symbol is not picked up for update
            # again next time.
            lib_chunk_store.write_metadata(
                symbol, {
                    self.META_KEY_MTIME: datetime.now(),
                    self.META_KEY_MAX_T_INSTORE: max_date_in_db
                })
            return
        # Stack the last chunk's data onto the new rows before updating.
        df = pd.concat([df, df_last_chunk])
        df.sort_index(axis=0, ascending=True, inplace=True)
        run_start = time.time()
        lib_chunk_store.update(symbol, df, upsert=True)
        logger.debug(
            f"Upsert {lib_name}-{symbol}, used {time.time() - run_start} secs, "
            f"{_get_df_rows_count(df)} rows")
        max_date_in_db = df.index.max()
        lib_chunk_store.write_metadata(
            symbol, {
                self.META_KEY_MTIME: datetime.now(),
                self.META_KEY_MAX_T_INSTORE: max_date_in_db
            })

    def _read_all(self, lib_name: str, symbol: str) -> pd.DataFrame:
        lib_chunk_store = self.arctic_store[lib_name]
        if not lib_chunk_store.has_symbol(symbol):
            return None
        run_start = time.time()
        df = lib_chunk_store.read(symbol)
        logger.debug(
            f"Read {lib_name}-{symbol} arctic, used {time.time() - run_start} secs, "
            f"{_get_df_rows_count(df)} rows")
        return df

    def _read_period(self, lib_name: str, symbol: str,
                     start_t: date = date(1990, 1, 1),
                     end_t: date = date(2050, 12, 31)) -> pd.DataFrame:
        lib_chunk_store = self.arctic_store[lib_name]
        run_start = time.time()
        df = lib_chunk_store.read(symbol,
                                  chunk_range=pd.date_range(start_t, end_t),
                                  filter_data=True)
        logger.debug(
            f"Read {lib_name}-{symbol} period [{start_t}-{end_t}], used "
            f"{time.time() - run_start} secs, {_get_df_rows_count(df)} rows")
        return df

    def _read_cs(self, lib_name: str, api_name: str, t: date) -> pd.DataFrame:
        if not self.is_symbol_exist(lib_name, api_name):
            return None
        lib_chunk_store = self.arctic_store[lib_name]
        run_start = time.time()
        df = lib_chunk_store.read(api_name,
                                  chunk_range=pd.date_range(t, t),
                                  filter_data=True)
        logger.debug(
            f"Read {lib_name}-{api_name} date {t}, used {time.time() - run_start} secs, "
            f"{_get_df_rows_count(df)} rows")
        return df

    def _read_meta(self, lib_name: str, symbol: str) -> Dict[str, Any]:
        lib_chunk_store = self.arctic_store[lib_name]
        try:
            return lib_chunk_store.read_metadata(symbol)
        except NoDataFoundException:
            return None

    def _write_meta(self, lib_name: str, symbol: str, meta: Dict[str, Any]):
        lib_chunk_store = self.arctic_store[lib_name]
        lib_chunk_store.write_metadata(symbol, meta)

    def _remove_symbol(self, lib_name: str, symbol: str):
        if not self.arctic_store.library_exists(lib_name):
            return
        lib_chunk_store = self.arctic_store[lib_name]
        lib_chunk_store.delete(symbol)

    def ts_upsert_arctic_storage(self, lib_name: str, symbol: str, df: pd.DataFrame,
                                 chunk_size: str = "M", force_reinit: bool = False):
        """For derived ts data; arctic only stores the derived contents."""
        assert df is not None and df.shape[0] > 0  # writing an empty df is not allowed
        if force_reinit:
            self._remove_symbol(lib_name, symbol)
        if not self.arctic_store.library_exists(lib_name):
            self.arctic_store.initialize_library(lib_name, lib_type=CHUNK_STORE)
        lib_chunk_store = self.arctic_store[lib_name]
        run_start = time.time()
        if not lib_chunk_store.has_symbol(symbol):
            # First write.
            lib_chunk_store.write(symbol, df, chunk_size=chunk_size, upsert=True)
            logger.debug(
                f"Init write {lib_name}-{symbol} arctic, used {time.time() - run_start} secs, "
                f"{_get_df_rows_count(df)} rows")
        else:
            # Upsert.
            min_date = df.index.min()
            # Read back the last chunk; it has to be updated together with the new rows.
            df_last_chunk = next(lib_chunk_store.reverse_iterator(symbol))
            max_date_in_db = df_last_chunk.index.max()
            if max_date_in_db >= min_date:
                err_msg = (f"Can't append data: {lib_name}-{symbol} already exists in db. "
                           f"max_db_t:{max_date_in_db}, min_data_t:{min_date}. "
                           f"Maybe another process has updated the ts data!")
                logger.error(err_msg)
                return
            # Stack the last chunk's data onto the new rows before updating.
            df = pd.concat([df, df_last_chunk])
            df.sort_index(axis=0, ascending=True, inplace=True)
            run_start = time.time()
            lib_chunk_store.update(symbol, df, upsert=True)
            logger.debug(
                f"Upsert {lib_name}-{symbol}, used {time.time() - run_start} secs, "
                f"{_get_df_rows_count(df)} rows")
        max_date_in_db = df.index.max()
        lib_chunk_store.write_metadata(
            symbol, {
                self.META_KEY_MTIME: datetime.now(),
                self.META_KEY_MAX_T_INSTORE: max_date_in_db
            })
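# --- Usage sketch for ArcticChunkStorage (library and symbol names are
# hypothetical): TS data is written per stock with monthly chunks, and rows
# strictly newer than the stored max date can later be upserted.
import pandas as pd
from datetime import date

def _demo_chunk_storage():
    storage = ArcticChunkStorage()
    df = pd.DataFrame({"close": [10.0, 10.5]},
                      index=pd.date_range("2021-01-01", periods=2, name="date"))
    storage.init_write_chunk_lib("demo_lib", "M", "000001.SZ", df)
    newer = pd.DataFrame({"close": [11.0]},
                         index=pd.date_range("2021-01-03", periods=1, name="date"))
    storage.ts_upsert_arctic_storage("demo_lib", "000001.SZ", newer, chunk_size="M")
    print(storage._read_period("demo_lib", "000001.SZ",
                               date(2021, 1, 1), date(2021, 1, 31)))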
class TrailMetricsArcticReporter:
    NNI_EXPERIMENT_LIB = "NNI_EXPERIMENT_METRICS"
    COL_FINAL_RESULT = "final_result"
    INTERMEDIATE_START_DATE = datetime(2000, 1, 1)
    FINAL_METRICS_DATE = datetime(2010, 1, 1)

    def __init__(self, experiment_name: str, experiment_uuid: str, trial_uuid: str):
        set_http_proxy()
        self.experiment_name = experiment_name
        self.experiment_uuid = experiment_uuid
        self.trial_uuid = trial_uuid
        # No matter where the program runs, always communicate through the
        # arctic instance on Google.
        self.arctic_store = Arctic(get_google_mongo_conn_str(),
                                   connectTimeoutMS=600 * 1000,
                                   serverSelectionTimeoutMS=600 * 1000)
        if not self.arctic_store.library_exists(TrailMetricsArcticReporter.NNI_EXPERIMENT_LIB):
            self.arctic_store.initialize_library(TrailMetricsArcticReporter.NNI_EXPERIMENT_LIB,
                                                 lib_type=CHUNK_STORE)
        self.arctic_lib = self.arctic_store[TrailMetricsArcticReporter.NNI_EXPERIMENT_LIB]
        self._curr_write_epoch_id: int = 0

    def _write_arctic(self, df: pd.DataFrame):
        if not self.arctic_lib.has_symbol(self.trial_uuid):
            self.arctic_lib.write(self.trial_uuid, df, chunk_size="D", upsert=True)
            self.arctic_lib.write_metadata(self.trial_uuid,
                                           {"experiment_name": self.experiment_name,
                                            "experiment_uuid": self.experiment_uuid})
        else:
            self.arctic_lib.update(self.trial_uuid, df, upsert=True)

    def report_intermediate_result(self, epoch: int, metrics: Mapping[str, Any]):
        df = pd.DataFrame(
            data=metrics,
            index=pd.DatetimeIndex(
                [TrailMetricsArcticReporter.INTERMEDIATE_START_DATE + timedelta(days=epoch)],
                name="date"))
        self._write_arctic(df)

    def report_final_result(self, val: float):
        df = pd.DataFrame(
            data={TrailMetricsArcticReporter.COL_FINAL_RESULT: val},
            index=pd.DatetimeIndex([TrailMetricsArcticReporter.FINAL_METRICS_DATE],
                                   name="date"))
        self._write_arctic(df)

    def query_metrics(self, latest_epoch: Optional[int]) -> Tuple[
            Optional[int], Optional[List[Dict[str, float]]], Optional[float]]:
        """Query the metrics.

        Parameters
        ----------
        latest_epoch : int
            Query incrementally, starting after this epoch (exclusive); omit to
            query from the first epoch.

        Returns
        -------
        latest epoch : Optional[int]
            The most recent epoch.
        intermediate metrics : Optional[List[Dict[str, float]]]
            The intermediate metrics recorded after the input latest_epoch.
        final result : Optional[float]
            The final result, if one has already been reported.
        """
        if not self.arctic_lib.has_symbol(self.trial_uuid):
            return None, None, None
        start_t = TrailMetricsArcticReporter.INTERMEDIATE_START_DATE
        if latest_epoch is not None:
            start_t = (TrailMetricsArcticReporter.INTERMEDIATE_START_DATE
                       + timedelta(days=latest_epoch + 1))
        end_t = TrailMetricsArcticReporter.FINAL_METRICS_DATE
        df = self.arctic_lib.read(self.trial_uuid,
                                  chunk_range=pd.date_range(start_t, end_t),
                                  filter_data=True)
        if df is None or df.shape[0] == 0:
            return latest_epoch, None, None
        metrics_cols = df.columns.to_list()
        if TrailMetricsArcticReporter.COL_FINAL_RESULT in metrics_cols:
            metrics_cols.remove(TrailMetricsArcticReporter.COL_FINAL_RESULT)
        final_result = None
        ls_intermediate_metrics = []
        rlt_latest_epoch = latest_epoch
        for row_index, row in df.iterrows():
            if row_index < TrailMetricsArcticReporter.FINAL_METRICS_DATE:
                curr_epoch_id = (row_index
                                 - TrailMetricsArcticReporter.INTERMEDIATE_START_DATE).days
                if rlt_latest_epoch is None or curr_epoch_id > rlt_latest_epoch:
                    rlt_latest_epoch = curr_epoch_id
                ls_intermediate_metrics.append({k: row[k] for k in metrics_cols})
            elif row_index == TrailMetricsArcticReporter.FINAL_METRICS_DATE:
                final_result = row[TrailMetricsArcticReporter.COL_FINAL_RESULT]
            else:
                raise RuntimeError(
                    f"invalid date {row_index} in trial {self.trial_uuid} metrics")
        return (rlt_latest_epoch,
                ls_intermediate_metrics if len(ls_intermediate_metrics) > 0 else None,
                final_result)
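# --- Usage sketch (hypothetical ids): epochs map onto dates starting at
# INTERMEDIATE_START_DATE, so an incremental "everything after epoch N" query
# becomes a simple chunk-range read.
def _demo_reporter():
    reporter = TrailMetricsArcticReporter("demo-exp", "exp-uuid-1", "trial-uuid-1")
    for epoch in range(3):
        reporter.report_intermediate_result(epoch, {"loss": 1.0 / (epoch + 1)})
    reporter.report_final_result(0.42)
    latest, intermediate, final = reporter.query_metrics(latest_epoch=0)
    print(latest, intermediate, final)  # epochs 1..2 plus the final result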
from arctic import Arctic, CHUNK_STORE  # pyright: reportMissingImports=false
import os
import pandas as pd
from pymongo import MongoClient
import keyring
import ssl

if __name__ == "__main__":
    # client = MongoClient("localhost")
    client = MongoClient(keyring.get_password('atlas', 'connection_string'),
                         ssl_cert_reqs=ssl.CERT_NONE)
    a = Arctic(client)
    if a.library_exists('fund'):
        a.delete_library('fund')
    if a.library_exists('fund_adj'):
        a.delete_library('fund_adj')
    # initialize_library returns None; fetch the library handles via indexing.
    a.initialize_library('fund', CHUNK_STORE)
    a.initialize_library('fund_adj', CHUNK_STORE)
    fund = a['fund']
    fund_adj = a['fund_adj']

    local = Arctic('localhost')
    fund_local = local['fund']
    fund_adj_local = local['fund_adj']
import os
from collections import defaultdict

import faust
from arctic import Arctic, TICK_STORE
from arctic.date import mktz

from akira.position_manager.models import Order
from faust.livecheck import Case, Signal

app = faust.App(
    "akira-env-position-manager",
    broker=f"kafka://{os.environ.get('KAFKA_BOOSTRAPHOST', 'localhost:9092')}",
    origin='position-manager.livecheck')

store = Arctic(os.environ.get("MONGODB_URI", "localhost:27017"))
libname = os.environ.get("ORDER_LIBNAME", "akira-env.order")
if not store.library_exists(libname):
    # initialize_library returns None, so look the library up afterwards.
    store.initialize_library(libname, lib_type=TICK_STORE)
lib = store[libname]

execution_topic = app.topic('order-execution', value_type=Order)
orders_executed_topic = app.topic('order-executed', value_type=Order)
orders_topic = app.topic('orders', value_type=Order)


class Position(faust.Record):
    amount: float = 0
    price: float = 0

    def __add__(self, order):
        old = self.amount