def __init__(self, G, *args, **kwargs):
    """
    The DataTransform MDB module was written as a decorator class which
    should be applied to the load_data function of a Biothings Uploader.
    The load_data function yields documents, which are then post-processed
    by __call__, where the 'id' key conversion is performed.

    :param G: nx.DiGraph (networkx 2.1) configuration graph
    :param input_types: A list of input types of the form (identifier, field),
        where identifier matches a node in the graph and field is an optional
        dotstring field from which the identifier should be read
        (the default is '_id').
    :param output_types: A priority list of identifiers to convert to.
        These identifiers should match nodes in the graph.
    :type output_types: list(str)
    :param skip_on_failure: If True, documents where identifier conversion
        fails will be skipped in the final document list.
    :type skip_on_failure: bool
    :param skip_w_regex: Do not perform conversion if the identifier matches
        the regular expression provided to this argument. By default, this
        option is disabled.
    :type skip_w_regex: bool
    :param idstruct_class: Override an internal data structure used by this
        module (advanced usage).
    :type idstruct_class: class
    :param copy_from_doc: If True, an identifier is copied from the input
        source document regardless of whether it matches an edge or not
        (advanced usage).
    :type copy_from_doc: bool
    """
    if not isinstance(G, nx.DiGraph):
        raise ValueError(
            "key_lookup configuration error: G must be of type nx.DiGraph")
    self._validate_graph(G)
    self.G = G
    self.logger, _ = get_logger('datatransform')
    super().__init__(*args, **kwargs)
    self._precompute_paths()
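# Editor's sketch (hypothetical names): assuming the class above is the
# DataTransformMDB decorator and `graph_example` is a pre-built nx.DiGraph
# whose nodes include 'drugbank' and 'inchikey', it would typically be
# applied to an uploader's load_data method like this:
from biothings.hub.datatransform import DataTransformMDB  # import path assumed
import biothings.hub.dataload.uploader as uploader


class ExampleUploader(uploader.BaseSourceUploader):
    name = "example_source"

    @DataTransformMDB(graph_example,   # graph_example is hypothetical
                      input_types=[("drugbank", "example.drugbank_id")],
                      output_types=["inchikey", "drugbank"],
                      skip_on_failure=True)
    def load_data(self, data_folder):
        # parse_example is a hypothetical parser yielding one dict per document
        for doc in parse_example(data_folder):
            yield doc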
def __init__(self, managers, version_urls, indexer_factory=None, *args, **kwargs):
    """
    version_urls is a list of URLs pointing to a versions.json file. The name
    of the data release is taken from the URL
    (http://...s3.amazon.com/<the_name>/versions.json) unless it is specified
    as a dict: {"name": "custom_name", "url": "http://..."}

    If indexer_factory is passed, it'll be used to create the indexer used to
    dump/check versions currently installed on ES, restore snapshots, index,
    etc. An indexer_factory is typically used to generate indexers dynamically
    (ES host, index name, etc.) according to the URLs, for instance. See the
    standalone.hub.DynamicIndexerFactory class for an example. It is typically
    used when lots of data releases are being managed by the Hub (so there is
    no need to manually update the STANDALONE_CONFIG parameter).

    If indexer_factory is None, a config parameter named STANDALONE_CONFIG is
    used; its format is the following:

        {
            "_default": {"es_host": "...", "index": "...", "doc_type": "..."},
            "the_name": {"es_host": "...", "index": "...", "doc_type": "..."},
        }

    When a data release name (taken from the URL) matches an entry, that entry
    is used to configure which ES backend to target; otherwise the default
    one is used.
    """
    super().__init__(*args, **kwargs)
    self.version_urls = self.extract(version_urls)
    self.indexer_factory = indexer_factory
    self.managers = managers
    self.logger, _ = get_logger("autohub")
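# Editor's sketch (hypothetical URLs and release names): version_urls can mix
# plain URLs, whose release name is inferred from the path, with dicts that
# set the name explicitly; STANDALONE_CONFIG then maps release names to ES
# backends, falling back to "_default".
version_urls = [
    "https://example-bucket.s3.amazonaws.com/demo_allspecies/versions.json",  # name: demo_allspecies
    {"name": "demo_custom",
     "url": "https://example.org/releases/versions.json"},
]

STANDALONE_CONFIG = {
    "_default": {"es_host": "localhost:9200", "index": "biothings_current", "doc_type": "doc"},
    "demo_allspecies": {"es_host": "localhost:9200", "index": "demo_allspecies_current", "doc_type": "doc"},
}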
def merger_worker(col_name, dest_name, ids, mapper, upsert, batch_num):
    try:
        src = mongo.get_src_db()
        tgt = mongo.get_target_db()
        col = src[col_name]
        # if batch_num == 2:
        #     raise ValueError("oula pa bon")
        dest = DocMongoBackend(tgt, tgt[dest_name])
        cur = doc_feeder(col, step=len(ids), inbatch=False,
                         query={'_id': {'$in': ids}})
        mapper.load()
        docs = mapper.process(cur)
        cnt = dest.update(docs, upsert=upsert)
        return cnt
    except Exception as e:
        logger_name = "build_%s_%s_batch_%s" % (dest_name, col_name, batch_num)
        logger = get_logger(logger_name, btconfig.LOG_FOLDER)
        logger.exception(e)
        exc_fn = os.path.join(btconfig.LOG_FOLDER, "%s.pick" % logger_name)
        pickle.dump(e, open(exc_fn, "wb"))
        logger.info("Exception was dumped in pickle file '%s'" % exc_fn)
        raise
def indexer_worker(col_name, ids, pindexer, batch_num, mode="index",
                   worker=new_index_worker):
    try:
        if mode in ["index", "merge"]:
            return worker(col_name, ids, pindexer, batch_num)
        elif mode == "resume":
            idxr = pindexer()
            es_ids = idxr.mexists(ids)
            missing_ids = [e[0] for e in es_ids if e[1] is False]
            if missing_ids:
                return worker(col_name, missing_ids, pindexer, batch_num)
            else:
                # fake indexer result: it has to be a tuple whose first
                # element is the number of indexed docs
                return (0, None)
    except Exception as e:
        logger_name = "index_%s_%s_batch_%s" % (
            pindexer.keywords.get("index", "index"), col_name, batch_num)
        logger, _ = get_logger(logger_name, btconfig.LOG_FOLDER)
        logger.exception("indexer_worker failed")
        exc_fn = os.path.join(btconfig.LOG_FOLDER, "%s.pick" % logger_name)
        pickle.dump({"exc": e, "ids": ids}, open(exc_fn, "wb"))
        logger.info("Exception and IDs were dumped in pickle file '%s'", exc_fn)
        raise
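# Editor's sketch (hypothetical values): `pindexer.keywords` in the except
# block above implies pindexer is typically a functools.partial wrapping an
# indexer class; ESIndexer and its keyword arguments are assumptions here.
from functools import partial

from biothings.utils.es import ESIndexer

pindexer = partial(ESIndexer, index="mygene_current", doc_type="gene",
                   es_host="localhost:9200")
ids_batch = ["1017", "1018", "1019"]  # hypothetical document _ids

# "index"/"merge" modes index every ID in the batch; "resume" calls
# idxr.mexists(ids) first and only re-submits IDs missing from the index.
result = indexer_worker("mygene_allspecies", ids_batch, pindexer,
                        batch_num=1, mode="resume")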
def __init__(self, input_types, output_types, id_priority_list=[],
             skip_on_failure=False, skip_w_regex=None, skip_on_success=False,
             idstruct_class=IDStruct, copy_from_doc=False,
             debug=False):  # pylint: disable=R0913, W0102
    """
    Initialize the keylookup object and precompute paths from the start key
    to all target keys.

    The decorator is intended to be applied to the load_data function of an
    uploader. The load_data function yields documents, which are then
    post-processed by __call__, where the 'id' key conversion is performed.

    :param input_types: list of input types (identifiers) to start the key
        lookup from
    :param output_types: list of all output types to convert to
    :param id_priority_list: A priority list of identifiers to sort input
        and output types by.
    :type id_priority_list: list(str)
    :param skip_on_failure: if True, skip documents whose identifier could
        not be converted
    :param skip_w_regex: do not perform conversion if the identifier matches
        this regular expression
    :param skip_on_success: if True, skip documents whose identifier was
        successfully converted
    :param idstruct_class: IDStruct class used to manage/fetch IDs from docs
    :param copy_from_doc: if the transform fails using the graph, try to get
        the value from the document itself when output_type == input_type.
        No check is performed, it's a straight copy. If checks are needed
        (e.g. to verify that an ID referenced in the doc actually exists in
        another collection), nodes with self-loops can be used, so ID
        resolution will be forced to go through these loops to ensure the
        data exists.
    :param debug: if True, enable debug output
    """
    self.input_types = self._parse_input_types(input_types)
    self.output_types = self._parse_output_types(output_types)
    self.id_priority_list = id_priority_list
    self.skip_on_failure = skip_on_failure
    self.skip_on_success = skip_on_success
    if skip_w_regex and not isinstance(skip_w_regex, str):
        raise ValueError('skip_w_regex must be a string')
    elif not skip_w_regex:
        self.skip_w_regex = None
    else:
        self.skip_w_regex = re.compile(skip_w_regex)
    self.idstruct_class = idstruct_class
    self.copy_from_doc = copy_from_doc
    self.histogram = Histogram()
    # Setup logger and logging level
    self.logger, _ = get_logger('datatransform')
    self.debug = debug
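# Editor's sketch, purely to illustrate the argument shapes accepted above
# (identifier and field names are hypothetical): input_types may be given as
# bare identifier names (read from '_id') or as (identifier, dotstring field)
# tuples; id_priority_list re-orders both input and output types.
transform = DataTransform(
    input_types=[("ensembl", "gene.ensembl_id"), "entrez"],
    output_types=["entrez", "ensembl"],
    id_priority_list=["entrez", "ensembl"],
    skip_on_failure=True,
)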
def __init__(self, src_name, data_folder, load_ensembl2entrez=True):
    self.data_folder = data_folder
    self.ensembl2entrez_li = None
    self.ensembl_main = None
    if load_ensembl2entrez:
        self._load_ensembl2entrez_li(src_name)
        self.ensembl2entrez = list2dict(self.ensembl2entrez_li, 0,
                                        alwayslist=True)
    self.logger, self.logfile = get_logger("parse_%s" % src_name)
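# Editor's sketch (hypothetical IDs): assuming list2dict from
# biothings.utils.common groups (key, value) pairs by the element at the
# given index, with alwayslist=True forcing every value into a list:
from biothings.utils.common import list2dict

pairs = [("ENSG00000139618", 675), ("ENSG00000139618", 999),
         ("ENSG00000141510", 7157)]
mapping = list2dict(pairs, 0, alwayslist=True)
# mapping -> {"ENSG00000139618": [675, 999], "ENSG00000141510": [7157]}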
def __init__(self, index_name, snapshot_name, env_conf, build_doc):
    self.index_name = index_name
    self.snapshot_name = snapshot_name
    self.build_doc = build_doc
    self.env_conf = env_conf
    self.logger, self.logfile = get_logger(SNAPSHOOTER_CATEGORY,
                                           btconfig.LOG_FOLDER)
def __init__(self, input_types, output_types, *args, **kwargs):
    """
    Initialize the DataTransformAPI object.
    """
    self._generate_return_fields()
    super(DataTransformAPI, self).__init__(input_types, output_types,
                                           *args, **kwargs)
    # default value of None for client
    self.client = None
    # Keep track of one-to-many relationships
    self.one_to_many_cnt = 0
    self.logger, _ = get_logger('keylookup_api')
def upload_worker(name, storage_class, loaddata_func, col_name, batch_size,
                  batch_num, *args):
    """
    Picklable job launcher, typically run from multiprocessing.
    storage_class will be instantiated with col_name, the destination
    collection name. loaddata_func is the parsing/loading function,
    called with `*args`.
    """
    data = []
    try:
        data = loaddata_func(*args)
        if type(storage_class) is tuple:
            klass_name = "_".join([k.__name__ for k in storage_class])
            storage = type(klass_name, storage_class, {})(None, col_name, loggingmod)
        else:
            storage = storage_class(None, col_name, loggingmod)
        return storage.process(data, batch_size)
    except Exception as e:
        logger_name = "%s_batch_%s" % (name, batch_num)
        logger, logfile = get_logger(logger_name, config.LOG_FOLDER)
        logger.exception(e)
        logger.error("Parameters:\nname=%s\nstorage_class=%s\n" % (name, storage_class) +
                     "loaddata_func=%s\ncol_name=%s\nbatch_size=%s\n" %
                     (loaddata_func, col_name, batch_size) +
                     "args=%s" % repr(args))
        import pickle
        pickfile = os.path.join(os.path.dirname(logfile), "%s.pick" % logger_name)
        try:
            pickle.dump(
                {
                    "exc": e,
                    "params": {
                        "name": name,
                        "storage_class": storage_class
                    },
                    "loaddata_func": loaddata_func,
                    "col_name": col_name,
                    "batch_size": batch_size,
                    "args": args
                },
                open(pickfile, "wb"))
        except TypeError as ie:
            logger.warning("Could not pickle batch errors: %s" % ie)
        raise e
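# Editor's sketch (hypothetical path): when a batch fails, the except block
# above pickles the exception and its parameters next to the batch's log
# file; the dump can be reloaded for post-mortem inspection like this:
import pickle

with open("/path/to/logs/my_source_batch_3.pick", "rb") as fh:  # hypothetical file
    dumped = pickle.load(fh)

print(dumped["exc"])              # the original exception
print(dumped["params"]["name"])   # source name
print(dumped["col_name"], dumped["batch_size"])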
def upload_worker(name, storage_class, loaddata_func, col_name, batch_size,
                  batch_num, *args):
    """
    Picklable job launcher, typically run from multiprocessing.
    storage_class will be instantiated with col_name, the destination
    collection name. loaddata_func is the parsing/loading function,
    called with *args.
    """
    try:
        data = loaddata_func(*args)
        storage = storage_class(None, col_name, loggingmod)
        return storage.process(data, batch_size)
    except Exception as e:
        logger_name = "%s_batch_%s" % (name, batch_num)
        logger = get_logger(logger_name, config.LOG_FOLDER)
        logger.exception(e)
        raise
def __init__(self, source_list, features=None, name="BioThings Hub",
             managers_custom_args={}, api_config=None, reloader_config=None,
             dataupload_config=None, websocket_config=None):
    """
    Helper to set up and instantiate the common managers usually used in a
    hub (eg. dumper manager, uploader manager, etc...)

    "source_list" is either:
        - a list of strings corresponding to paths to datasource modules
        - a package containing sub-folders with datasource modules

    Specific managers can be retrieved by adjusting the "features" parameter,
    where each feature corresponds to one or more managers. The parameter
    defaults to all available features. Managers are configured/initialized
    in the same order as the list, so if a manager (eg. job_manager) is
    required by all others, it must be the first in the list.

    "managers_custom_args" is an optional dict used to pass specific
    arguments while initializing managers:

        managers_custom_args={"upload": {"poll_schedule": "*/5 * * * *"}}

    will set the poll schedule to check uploads every 5 minutes (instead of
    the default 10s).

    "reloader_config", "dataupload_config" and "websocket_config" can be
    used to customize the reloader, dataupload and websocket features. If
    None, the default config is used. If explicitly False, the feature is
    deactivated.
    """
    self.name = name
    self.source_list = source_list
    self.logger, self.logfile = get_logger("hub")
    self._passed_features = features
    self._passed_managers_custom_args = managers_custom_args
    self.features = self.clean_features(features or self.DEFAULT_FEATURES)
    self.managers_custom_args = managers_custom_args
    self.reloader_config = reloader_config or self.DEFAULT_RELOADER_CONFIG
    self.dataupload_config = dataupload_config or self.DEFAULT_DATAUPLOAD_CONFIG
    self.websocket_config = websocket_config or self.DEFAULT_WEBSOCKET_CONFIG
    # collect listeners that should be connected (push data through) to the websocket
    self.ws_listeners = []
    self.api_config = api_config or self.DEFAULT_API_CONFIG
    # set during configure()
    self.managers = None
    self.api_endpoints = None
    self.shell = None
    self.commands = None
    self.extra_commands = None
    self.routes = []
    # flag: "do we need to configure?"
    self.configured = False
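# Editor's sketch (hypothetical package and hub name; import path assumed):
# a typical hub entry point instantiates this server with a datasource
# package and a few custom manager arguments, then starts it.
from biothings.hub import HubServer

import hub.dataload.sources  # hypothetical package of datasource modules

server = HubServer(
    hub.dataload.sources,
    name="Example Hub",
    managers_custom_args={"upload": {"poll_schedule": "*/5 * * * *"}},
)
server.start()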
def __init__(self, build_doc, indexer_env, index_name):
    # build_doc primarily describes the source.
    # indexer_env primarily describes the destination.
    _build_doc = _BuildDoc(build_doc)
    _build_backend = _build_doc.parse_backend()

    # ----------source----------
    self.mongo_client_args = _build_backend.args
    self.mongo_database_name = _build_backend.dbs
    self.mongo_collection_name = _build_backend.col

    # -----------dest-----------
    # [1] https://elasticsearch-py.readthedocs.io/en/v7.12.0/api.html#elasticsearch.Elasticsearch
    # [2] https://elasticsearch-py.readthedocs.io/en/v7.12.0/helpers.html#elasticsearch.helpers.bulk
    self.es_client_args = indexer_env.get("args", {})   # See [1] for available args
    self.es_blkidx_args = indexer_env.get("bulk", {})   # See [2] for available args
    self.es_index_name = index_name or _build_doc.build_name
    self.es_index_settings = IndexSettings(deepcopy(DEFAULT_INDEX_SETTINGS))
    self.es_index_mappings = IndexMappings(deepcopy(DEFAULT_INDEX_MAPPINGS))
    _build_doc.enrich_settings(self.es_index_settings)
    _build_doc.enrich_mappings(self.es_index_mappings)

    # -----------info-----------
    self.env_name = indexer_env.get("name")
    self.conf_name = _build_doc.build_config.get("name")
    self.build_name = _build_doc.build_name

    self.logger, self.logfile = get_logger('index_%s' % self.es_index_name)
    self.pinfo = ProcessInfo(self, indexer_env.get("concurrency", 10))
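# Editor's sketch (hypothetical values), based on the keys read above:
# "args" is passed to the Elasticsearch client, "bulk" to the bulk helper,
# and "concurrency" caps the number of parallel indexing jobs.
indexer_env = {
    "name": "prod",
    "args": {"hosts": "localhost:9200", "timeout": 300, "max_retries": 10,
             "retry_on_timeout": True},
    "bulk": {"chunk_size": 500, "raise_on_exception": False},
    "concurrency": 3,
}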
def __init__(self, *args, **kwargs):
    """
    An example of a config dict for this module:

        {
            "indexer_select": {
                # default
                None: "hub.dataindex.indexer.DrugIndexer",
                "build_config.cold_collection": "mv.ColdHotVariantIndexer",
            },
            "env": {
                "prod": {
                    "host": "localhost:9200",
                    "indexer": {
                        "args": {
                            "timeout": 300,
                            "retry_on_timeout": True,
                            "max_retries": 10,
                        },
                        "bulk": {
                            "chunk_size": 50,
                            "raise_on_exception": False
                        },
                        "concurrency": 3
                    },
                    "index": [
                        # for information only, only used in index_info
                        {"index": "mydrugs_current", "doc_type": "drug"},
                        {"index": "mygene_current", "doc_type": "gene"}
                    ],
                },
                "dev": { ... }
            }
        }
    """
    super().__init__(*args, **kwargs)
    self._srcbuild = get_src_build()
    self._config = {}
    self.logger, self.logfile = get_logger('indexmanager')
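# Editor's sketch (hypothetical helper, semantics inferred from the config
# example above): an "indexer_select" mapping is resolved by checking which
# dotted key is present in the build document; the None entry is the default
# indexer class path.
def select_indexer_path(indexer_select, build_doc):
    for dotted_key, klass_path in indexer_select.items():
        if dotted_key is None:
            continue
        node = build_doc
        for part in dotted_key.split("."):
            if not isinstance(node, dict) or part not in node:
                node = None
                break
            node = node[part]
        if node is not None:
            return klass_path
    return indexer_select[None]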
def setup_log(self):
    self.logger, self.logfile = get_logger("syncmanager")

def setup_log(self):
    return get_logger('upload_%s' % self.fullname)

def setup_log(self):
    self.logger, self.logfile = get_logger('sync')

def setup_log(self):
    self.logger, _ = get_logger('apimanager')

def setup_log(self):
    self.logger, _ = get_logger('keylookup')
def setup_log(self):
    """Setup and return a logger instance"""
    self.logger, self.logfile = get_logger('assistant_%s' % self.__class__.plugin_type)

def setup_log(self):
    """Setup and return a logger instance"""
    self.logger, self.logfile = get_logger('inspect')

def setup_log(self):
    self.logger, self.logfile = get_logger(SNAPSHOOTER_CATEGORY, self.log_folder)

def setup_log(self):
    """Setup and return a logger instance"""
    self.logger, self.logfile = get_logger('assistantmanager')

def setup_log(self):
    """Setup and return a logger instance"""
    self.logger, self.logfile = get_logger('loader_%s' % self.plugin_name)

def setup_log(self):
    self.logger, self.logfile = get_logger('indexmanager', self.log_folder)

def setup_log(self):
    self.logger, self.logfile = get_logger('index_%s' % self.index_name, self.log_folder)

def setup_log(self):
    """Setup the logger member variable"""
    self.logger, _ = get_logger('datatransform')
def _ensure_logger(logger):
    if not logger:
        return logging.getLogger(__name__)
    if isinstance(logger, str):
        return get_logger(logger)[0]
    return logger
import copy
import re

import networkx as nx
from networkx import all_simple_paths

import biothings.utils.mongo as mongo
from biothings.utils.loggers import get_logger
from biothings import config as btconfig
from biothings import config_for_app
from biothings.hub.datatransform.datatransform import DataTransform

# Configuration of collections from the biothings config file
config_for_app(btconfig)

# Setup logger and logging level
kl_log = get_logger('keylookup', btconfig.LOG_FOLDER)


class DataTransformSerial(DataTransform):
    # Constants
    DEFAULT_WEIGHT = 1
    default_source = '_id'

    def __init__(self, G, collections, input_types, output_types,
                 skip_on_failure=False, skip_w_regex=None):
        """ Initialize the keylookup object and precompute paths from the
def setup_log(self):
    self.logger, self.logfile = get_logger("dump_%s" % self.src_name)