def _generate_taxon_edge(obj_ver_key, obj_data):
    if 'taxon_ref' not in obj_data['data']:
        logger.info('No taxon ref in object; skipping..')
        return
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    result = ws_client.admin_req('getObjects', {
        'objects': [{'ref': obj_data['data']['taxon_ref']}]
    })
    taxonomy_id = result['data'][0]['data']['taxonomy_id']
    adb_resp = _stored_query('ncbi_fetch_taxon', {
        'id': str(taxonomy_id),
        'ts': int(time.time() * 1000),
    })
    adb_results = adb_resp['results']
    if not adb_results:
        logger.info(f'No taxonomy node in database for id {taxonomy_id}')
        return
    tax_key = adb_results[0]['_key']
    # Create an edge from the ws_object_ver to the taxon
    from_id = f"{_OBJ_VER_COLL}/{obj_ver_key}"
    to_id = f"{_TAX_VER_COLL}/{tax_key}"
    logger.info(f'Creating taxon edge from {from_id} to {to_id}')
    _save(_TAX_EDGE_COLL, [{'_from': from_id, '_to': to_id, 'assigned_by': '_system'}])
def _produce(data, topic=config()['topics']['admin_events']):
    """
    Produce a new event message on a Kafka topic, then poll briefly so that
    delivery callbacks can fire.
    """
    producer = Producer({'bootstrap.servers': config()['kafka_server']})
    producer.produce(topic, json.dumps(data), callback=_delivery_report)
    producer.poll(0.1)
def _reindex_ws_type(args):
    """
    Reindex all objects in the entire workspace server based on a type name.
    """
    if not re.match(r'^.+\..+-\d+\.\d+$', args.type):
        sys.stderr.write('Enter the full type name, such as "KBaseGenomes.Genome-17.0"')
        sys.exit(1)
    # - Iterate over all workspaces
    # - For each workspace, list objects
    # - For each obj matching args.type, produce a reindex event
    ws = WorkspaceClient(url=config()['kbase_endpoint'], token=config()['ws_token'])
    evtype = 'INDEX_NONEXISTENT'
    if args.overwrite:
        evtype = 'REINDEX'
    for wsid in range(args.start, args.stop + 1):
        try:
            infos = ws.admin_req('listObjects', {'ids': [wsid]})
        except WorkspaceResponseError as err:
            print(err.resp_data['error']['message'])
            continue
        for obj_info in infos:
            obj_type = obj_info[2]
            if obj_type == args.type:
                _produce({'evtype': evtype, 'wsid': wsid, 'objid': obj_info[0]})
    print('..done!')
def save(self, df):
    log.info("Save as parquet")
    HdfsUtils(None).write(df=df,
                          path=config("PATH_PARQUET_RESULT"),
                          format=config("PARQUET_FORMAT"),
                          partition_name="dt_partition",
                          save_mode=config("OVERWRITE_MODE"))
def get_sample(sample_info):
    """
    Get a sample from the SampleService.
    sample_info - dict containing the 'id' and, optionally, the 'version' of a sample
    """
    headers = {"Authorization": config()['ws_token']}
    params = {"id": sample_info['id'], "as_admin": True}
    if sample_info.get('version'):
        params['version'] = sample_info['version']
    payload = {
        "method": "SampleService.get_sample",
        "id": "",  # str(uuid.uuid4()),
        "params": [params],
        "version": "1.1"
    }
    resp = requests.post(url=config()['sample_service_url'],
                         headers=headers,
                         data=json.dumps(payload))
    if not resp.ok:
        raise RuntimeError(f"Returned from sample service with status {resp.status_code} - {resp.text}")
    resp_json = resp.json()
    if resp_json.get('error'):
        raise RuntimeError(f"Error from SampleService - {resp_json['error']}")
    sample = resp_json['result'][0]
    return sample
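# Usage sketch for get_sample; the sample id is made up, and the fields read
# from the result are assumptions about the SampleService response shape.
def _example_get_sample():
    sample = get_sample({'id': '9fb6a79c-3042-4f9b-8b1b-f6b106c9dba8', 'version': 1})
    logger.info(f"Fetched sample {sample.get('id')} version {sample.get('version')}")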
def _reindex_ws_type(args):
    """
    Reindex all objects in the entire workspace server based on a type name.
    """
    if not re.match(r'^.+\..+-\d+\.\d+$', args.type):
        sys.stderr.write('Enter the full type name, such as "KBaseGenomes.Genome-17.0"')
        sys.exit(1)
    # - Iterate over all workspaces
    # - For each workspace, list objects
    # - For each obj matching args.type, produce a reindex event
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    evtype = 'INDEX_NONEXISTENT'
    if args.overwrite:
        evtype = 'REINDEX'
    for wsid in range(args.start, args.stop + 1):
        wsid = int(wsid)
        try:
            infos = ws_client.generate_obj_infos(wsid, admin=True)
            for obj_info in infos:
                obj_type = obj_info[2]
                if obj_type == args.type:
                    _produce({'evtype': evtype, 'wsid': wsid, 'objid': int(obj_info[0])})
        except Exception as err:
            print(f'Error fetching object infos for workspace {wsid}: {err}')
            continue
    print('..done!')
def test_handle_msg_no_objtype():
    """
    Valid test path for filtering by type when no `objtype` field is provided,
    and we fetch the type from the workspace based on the object reference.
    """
    objtype = "TypeModule.TypeName-1.2"
    # Mock response
    mock_resp = {
        "version": "1.1",
        "result": [{
            "infos": [[
                1,                          # objid
                "objname",
                objtype,
                "2020-08-11T23:12:28+0000",
                57,                         # version
                "creator_username",
                33192,                      # workspace id
                "workspace_name",
                "checksum",
                24500,                      # bytes
                {},
            ]],
            "paths": [["33192/1/57"]]
        }]
    }
    responses.add(responses.POST, config()['workspace_url'], json=mock_resp)
    with set_env(SKIP_TYPES=objtype):
        config(force_reload=True)
        res = _handle_msg({'objid': 1, 'wsid': 3000, 'evtype': 'x'})
        assert res is None
def fetch_objects_in_workspace(ws_id, include_narrative=False):
    """
    Get a list of dicts with keys 'obj_type' and 'name' corresponding to all
    data objects in the requested workspace.
    Args:
        ws_id - a workspace id
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    try:
        narr_data_obj_info = ws_client.admin_req("listObjects", {"ids": [ws_id]})
    except WorkspaceResponseError as err:
        logger.error('Workspace response error: %s', err.resp_data)
        raise err
    if include_narrative:
        narrative_data = [
            {"obj_id": obj[0], "name": obj[1], "obj_type": obj[2], "ver": obj[4]}
            for obj in narr_data_obj_info
        ]
    else:
        narrative_data = [
            {"name": obj[1], "obj_type": obj[2]}
            for obj in narr_data_obj_info
            if 'KBaseNarrative' not in str(obj[2])
        ]
    return narrative_data
def produce(data: Any,
            topic: str = config()['topics']['admin_events'],
            callback: Callable = None) -> None:
    """
    Produce a new event message on a Kafka topic and block for it to get published.
    If the produce fails, it will be retried at most _KAFKA_PRODUCE_RETRIES times
    (defaults to 5).
    Args:
        data: the data to send to Kafka. Must be JSONable.
        topic: the topic where the data will be sent.
        callback: a callable provided to the confluent Kafka Producer class.
    """
    producer = Producer({'bootstrap.servers': config()['kafka_server']})
    tries = 0
    while True:
        try:
            producer.produce(topic, json.dumps(data), callback=callback)
            producer.flush()
            break
        except BufferError:
            if tries == _KAFKA_PRODUCE_RETRIES:
                raise RuntimeError("Unable to produce a Kafka message due to BufferError")
            logger.error("Received a BufferError trying to produce a message on Kafka. Retrying..")
            tries += 1
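# Usage sketch: emit a hypothetical reindex event with a delivery callback.
# The confluent-kafka callback signature is (err, msg); the wsid/objid values
# are illustrative, not from the source.
def _example_produce():
    def _report(err, msg):
        if err is not None:
            logger.error(f'Message delivery failed: {err}')
    produce({'evtype': 'REINDEX', 'wsid': 12345, 'objid': 6}, callback=_report)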
def main():
    # Set up the logger
    # Make the urllib debug logs less noisy
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    init_logger(logger)
    # Initialize and run the Kafka consumer
    topics = [
        config()['topics']['workspace_events'],
        config()['topics']['admin_events']
    ]
    consumer = kafka.init_consumer(topics)
    # atexit handlers are called with no arguments, so close the consumer via a
    # zero-arg lambda; signal handlers receive (signum, stack_frame).
    atexit.register(lambda: kafka.close_consumer(consumer))
    signal.signal(signal.SIGTERM, lambda signum, stack_frame: kafka.close_consumer(consumer))
    signal.signal(signal.SIGINT, lambda signum, stack_frame: kafka.close_consumer(consumer))
    # Run the main thread
    event_loop.start_loop(
        consumer,
        _handle_msg,
        on_success=_log_msg_to_elastic,
        on_failure=_log_err_to_es,
        on_config_update=es_indexer.reload_aliases,
        logger=logger)
def save(coll_name, docs, on_duplicate='update'):
    """
    Bulk-save documents to the relation engine database.
    API docs: https://github.com/kbase/relation_engine_api
    Args:
        coll_name - collection name
        docs - single dict or list of dicts to save into the collection as json documents
        on_duplicate - what to do on a unique key collision. One of 'update',
            'replace', 'ignore', 'error'.
    """
    if isinstance(docs, dict):
        docs = [docs]
    url = config()['re_api_url'] + '/api/v1/documents'
    # Convert the docs into a string, where each obj is separated by a linebreak
    payload = '\n'.join([json.dumps(d) for d in docs])
    params = {'collection': coll_name, 'on_duplicate': on_duplicate}
    params['display_errors'] = '1'
    resp = requests.put(
        url,
        data=payload,
        params=params,
        headers={'Authorization': config()['re_api_token']}
    )
    if not resp.ok:
        raise RuntimeError(f'Error response from RE API: {resp.text}')
    return resp.json()
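# Usage sketch: bulk-save two documents into a hypothetical 'ws_object'
# collection, ignoring unique key collisions instead of updating.
def _example_save():
    docs = [
        {'_key': '1:2', 'workspace_id': 1, 'object_id': 2},
        {'_key': '1:3', 'workspace_id': 1, 'object_id': 3},
    ]
    return save('ws_object', docs, on_duplicate='ignore')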
def test_handle_msg_allow_types():
    """
    Test that an event from a type NOT in the whitelist results in a no-op
    in _handle_msg.
    """
    with set_env(ALLOW_TYPES='xyz'):
        config(force_reload=True)
        res = _handle_msg({'objtype': 'abc', 'evtype': 'x'})
        assert res is None
def test_handle_msg_skip_types():
    """
    Test that an event from a type in the blacklist results in a no-op
    in _handle_msg.
    """
    with set_env(SKIP_TYPES='xyz'):
        config(force_reload=True)
        res = _handle_msg({'objtype': 'xyz', 'evtype': 'x'})
        assert res is None
def test_sample_set_indexer1():
    # Mock the request that checks for an existing sample
    url = config()['elasticsearch_url'] + '/search2.sample/_doc/SMP::1:1'
    responses.add(responses.GET, url, json={'found': False})
    # Mock the request against the sample service
    responses.add(responses.POST, config()['sample_service_url'],
                  json=data['sample_service_resp1'])
    results = indexer(data['obj1'], data['ws_info1'], data['obj1'], conf)
    for (idx, result) in enumerate(list(results)):
        assert result == data['expected_result1'][idx]
def test_handle_msg_skip_types2():
    """
    Test that an event from a type NOT in the blacklist results in _handle_msg
    trying to handle the message.
    """
    with set_env(SKIP_TYPES='xyz'):
        config(force_reload=True)
        res = _handle_msg({'objtype': 'abc', 'evtype': 'x'})
        assert res is None
def is_workspace_public(ws_id):
    """
    Check if a workspace is public, returning bool.
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    ws_info = ws_client.admin_req('getWorkspaceInfo', {'id': ws_id})
    global_read = ws_info[6]
    return global_read != 'n'
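# Usage sketch: gate an action on workspace visibility; the wsid is illustrative.
def _example_public_check(ws_id=33192):
    if is_workspace_public(ws_id):
        logger.info(f'Workspace {ws_id} is public')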
def _setup_docker_inputs(job_dir, obj_data, ws_info, obj_data_v1, sdk_app, sdk_func):
    """Set up parameters for input to the sdk application."""
    data_dir = job_dir + "/data"
    os.makedirs(data_dir)
    scratch_dir = job_dir + "/tmp"  # nosec
    os.mkdir(scratch_dir)  # nosec
    obj_data_path = data_dir + "/obj_data.json"
    ws_info_path = data_dir + "/ws_info.json"
    obj_data_v1_path = data_dir + "/obj_data_v1.json"
    # Write data to file
    with open(obj_data_path, "w") as fd:
        json.dump(obj_data, fd)
    with open(ws_info_path, "w") as fd:
        json.dump(ws_info, fd)
    with open(obj_data_v1_path, "w") as fd:
        json.dump(obj_data_v1, fd)
    # We want to provide the app the path within its own context.
    obj_data_path = _IN_APP_JOB_DIR + "/data/obj_data.json"
    ws_info_path = _IN_APP_JOB_DIR + "/data/ws_info.json"
    obj_data_v1_path = _IN_APP_JOB_DIR + "/data/obj_data_v1.json"
    input_ = {
        "version": "1.1",
        "method": sdk_app + "." + sdk_func,
        "params": [{
            'obj_data_path': obj_data_path,
            'ws_info_path': ws_info_path,
            'obj_data_v1_path': obj_data_v1_path
        }],
        "context": dict()
    }
    ijson = job_dir + "/input.json"
    with open(ijson, "w") as f:
        f.write(json.dumps(input_))
    # Write config for the sdk application
    sdk_config = ConfigParser()
    sdk_config['global'] = {
        'kbase_endpoint': config()['kbase_endpoint'],
        'workspace_url': config()['workspace_url'],
        'scratch': "/kb/module/work/tmp"
    }
    with open(job_dir + '/config.properties', 'w') as configfile:
        sdk_config.write(configfile)
    # Set up the token.
    with open(job_dir + '/token', 'w') as fd:
        fd.write(_TOKEN)
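# For reference, _setup_docker_inputs leaves the job directory laid out as
# follows (inside the container these paths are rooted at _IN_APP_JOB_DIR):
#
#   job_dir/
#     data/obj_data.json     - full parsed workspace object
#     data/ws_info.json      - workspace info for the object's workspace
#     data/obj_data_v1.json  - first version of the object
#     tmp/                   - scratch space for the app
#     input.json             - JSON-RPC style call to <sdk_app>.<sdk_func>
#     config.properties      - endpoints and scratch path for the SDK app
#     token                  - auth token for the app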
def index_obj(obj_data, ws_info, msg_data):
    """
    For a newly created object, generate the index document for it and push to
    the elasticsearch topic on Kafka.
    Args:
        obj_data - in-memory parsed data from the workspace object
        ws_info - in-memory parsed workspace info for the object's workspace
        msg_data - json event data received from the kafka workspace events
            stream. Must have keys for `wsid` and `objid`
    """
    obj_type = obj_data['info'][2]
    (type_module, type_name, type_version) = ws_utils.get_type_pieces(obj_type)
    if (type_module + '.' + type_name) in _TYPE_BLACKLIST:
        # Blacklisted type, so we don't index it
        return
    # Check if this particular object has the tag "noindex"
    metadata = ws_info[-1]
    # If the workspace's object metadata contains a "noindex" tag, skip it
    if metadata.get('searchtags'):
        if 'noindex' in metadata['searchtags']:
            return
    # Get the info of the first version to get the creation date of the object.
    upa = get_upa_from_msg_data(msg_data)
    try:
        obj_data_v1 = ws_client.admin_req('getObjects', {
            'objects': [{'ref': upa + '/1'}],
            'no_data': 1
        })
    except WorkspaceResponseError as err:
        logger.error('Workspace response error: %s', err.resp_data)
        raise err
    obj_data_v1 = obj_data_v1['data'][0]
    # Dispatch to a specific type handler to produce the search document
    indexer = _find_indexer(type_module, type_name, type_version)
    # All indexers are generators that yield document data for ES.
    defaults = indexer_utils.default_fields(obj_data, ws_info, obj_data_v1)
    for indexer_ret in indexer(obj_data, ws_info, obj_data_v1):
        if indexer_ret['_action'] == 'index':
            if config()['allow_indices'] and indexer_ret.get('index') not in config()['allow_indices']:
                # This index name is not in the indexing whitelist from the config, so we skip
                logger.debug(f"Index '{indexer_ret.get('index')}' is not in ALLOW_INDICES, skipping")
                continue
            if indexer_ret.get('index') in config()['skip_indices']:
                # This index name is in the indexing blacklist in the config, so we skip
                logger.debug(f"Index '{indexer_ret.get('index')}' is in SKIP_INDICES, skipping")
                continue
            if '_no_defaults' not in indexer_ret:
                # Inject all default fields into the index document.
                indexer_ret['doc'].update(defaults)
        yield indexer_ret
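# For reference: the minimal shape of a document yielded by an indexer, as
# inferred from the checks in index_obj above. Field values are illustrative.
_EXAMPLE_INDEXER_RET = {
    '_action': 'index',            # only 'index' actions go through the filters above
    'index': 'genome_2',           # compared against allow_indices / skip_indices
    'doc': {'name': 'my_genome'},  # the default fields get merged into this dict
    # '_no_defaults': True,        # optional: present to skip merging defaults
}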
def test_handle_msg_allow_types2():
    """
    Test that an event from a type IN the whitelist results in _handle_msg
    trying to handle the message.
    """
    with set_env(ALLOW_TYPES='xyz'):
        config(force_reload=True)
        with pytest.raises(RuntimeError) as ctx:
            _handle_msg({'objtype': 'xyz'})
    assert str(ctx.value) == "Missing 'evtype' in event: {'objtype': 'xyz'}"
def main():
    """
    Run the Kafka consumer and two threads for the releng_importer and es_indexer.
    """
    # Wait for dependency services (ES and RE) to be live
    wait_for_dependencies(timeout=180)
    # Used for re-fetching the configuration with a throttle
    last_updated_minute = int(time.time() / 60)
    if not config()['global_config_url']:
        config_tag = _fetch_latest_config_tag()
    # Database initialization
    es_indexer.init_indexes()
    es_indexer.reload_aliases()
    while True:
        msg = consumer.poll(timeout=0.5)
        if msg is None:
            continue
        curr_min = int(time.time() / 60)
        if not config()['global_config_url'] and curr_min > last_updated_minute:
            # Check for configuration updates
            latest_config_tag = _fetch_latest_config_tag()
            last_updated_minute = curr_min
            if config_tag is not None and latest_config_tag != config_tag:
                config(force_reload=True)
                config_tag = latest_config_tag
                es_indexer.reload_aliases()
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                logger.info('End of stream.')
            else:
                logger.error(f"Kafka message error: {msg.error()}")
            continue
        val = msg.value().decode('utf-8')
        try:
            msg = json.loads(val)
        except ValueError as err:
            logger.error(f'JSON parsing error: {err}')
            logger.error(f'Message content: {val}')
            continue  # skip unparseable messages rather than handling the raw value
        logger.info(f'Received event: {msg}')
        start = time.time()
        try:
            _handle_msg(msg)
            # Move the offset for our partition
            consumer.commit()
            logger.info(f"Handled {msg['evtype']} message in {time.time() - start}s")
        except Exception as err:
            logger.error(f'Error processing message: {err.__class__.__name__} {err}')
            logger.error(traceback.format_exc())
            # Save this error and message to a topic in Elasticsearch
            _log_err_to_es(msg, err=err)
def check_workspace_deleted(ws_id):
    """
    Since the DELETE_WORKSPACE event can correspond to workspace undeletion as
    well as deletion, make sure that the workspace is actually deleted. This is
    done by checking for an exception with the word 'delete' in the error body.
    """
    try:
        config()['ws_client'].admin_req("getWorkspaceInfo", {'id': ws_id})
    except WorkspaceResponseError as err:
        if 'delete' in err.resp_text:
            return True
    return False
def _get_sub_obj_index(indexer_app_vars):
    """Get the name of the sub-object index, if applicable; return None otherwise."""
    sub_obj_index = indexer_app_vars.get('sub_obj_index', None)
    if config()['global']['latest_versions'].get(sub_obj_index):
        sub_obj_index = config()['global']['latest_versions'][sub_obj_index]
    elif sub_obj_index is None:
        # Here we expect no sub_obj_index, so we move on
        pass
    else:
        raise ValueError(
            f"No 'latest_versions' field specified for {sub_obj_index} index in global config")
    return sub_obj_index
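# Sketch of the config shape _get_sub_obj_index expects; the index names here
# are hypothetical.
def _example_sub_obj_index():
    # With config()['global']['latest_versions'] == {'genome_features': 'genome_features_3'},
    # this returns 'genome_features_3'; an unmapped name raises ValueError.
    return _get_sub_obj_index({'sub_obj_index': 'genome_features'})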
def set_user_perms(msg):
    """
    Set user permissions for a workspace. Handles the SET_PERMISSION event.
    This only updates the `shared_users` field for the workspace/narrative.
    """
    wsid = int(msg['wsid'])
    perms = config()['ws_client'].admin_req('getPermissionsMass', {
        'workspaces': [{"id": wsid}]
    })
    shared_users = perms['perms'][0].keys()
    update = f"ctx._source.shared_users={list(shared_users)}"
    resp = _update_by_query({'term': {'access_group': wsid}}, update, config())
    return resp
def get_doc(coll, key):
    """Fetch a doc in a collection by key."""
    resp = requests.post(
        config()['re_api_url'] + '/api/v1/query_results',
        data=json.dumps({
            'query': "for v in @@coll filter v._key == @key limit 1 return v",
            '@coll': coll,
            'key': key
        }),
        headers={'Authorization': config()['re_api_token']})
    if not resp.ok:
        raise RuntimeError(resp.text)
    return resp.json()
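# Usage sketch: fetch one taxon document by key from a hypothetical
# 'ncbi_taxon' collection; matches come back under 'results' in the RE API
# response (see _generate_taxon_edge above).
def _example_get_doc():
    resp = get_doc('ncbi_taxon', '562')
    return resp['results']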
def _fetch_objects_in_workspace(ws_id):
    """
    Get a list of dicts with keys 'obj_type' and 'name' corresponding to all
    data objects in the requested workspace. This excludes the narrative object.
    Args:
        ws_id - a workspace id
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    obj_infos = ws_client.generate_obj_infos(ws_id, admin=True)
    return [
        {"name": info[1], "obj_type": info[2]}
        for info in obj_infos
        if 'KBaseNarrative' not in str(info[2])
    ]
def clear_collection(collection_name):
    """
    Remove all the documents in a collection without affecting indexes.
    collection_name - the collection to clear.
    """
    resp = requests.post(config()['re_api_url'] + '/api/v1/query_results',
                         data=json.dumps({
                             'query': 'FOR d in @@col REMOVE(d) IN @@col',
                             '@col': collection_name
                         }),
                         headers={'Authorization': config()['re_api_token']})
    if not resp.ok:
        raise RuntimeError(resp.text)
def delete_obj(msg):
    """
    Handle an object deletion event (OBJECT_DELETE_STATE_CHANGE).
    Delete everything that was created for this object. This is the inverse
    operation of the import_obj action.
    """
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    obj_ref = f"{msg['wsid']}/{msg['objid']}"
    if msg.get("ver"):
        obj_ref += f"/{msg['ver']}"
    obj_info = ws_client.admin_req('getObjectInfo', {
        'objects': [{'ref': obj_ref}]
    })['infos'][0]
    delete_object(obj_info)
def init_consumer(topics: List[str]) -> Consumer:
    """
    Initialize a Kafka consumer instance.
    """
    consumer = Consumer({
        'bootstrap.servers': config()['kafka_server'],
        'group.id': config()['kafka_clientgroup'],
        'auto.offset.reset': 'earliest',
        'enable.auto.commit': False
    })
    logger.info(f"Subscribing to: {topics}")
    logger.info(f"Client group: {config()['kafka_clientgroup']}")
    logger.info(f"Kafka server: {config()['kafka_server']}")
    consumer.subscribe(topics)
    return consumer
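# Usage sketch: subscribe to one configured topic and poll once, mirroring the
# consumer loop in main() above.
def _example_consume():
    consumer = init_consumer([config()['topics']['workspace_events']])
    msg = consumer.poll(timeout=0.5)  # None when no message is ready
    if msg is not None and not msg.error():
        print(msg.value().decode('utf-8'))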
def get_all_documents(collection_name):
    """
    Returns all the documents in a collection. Using this on a large collection
    is not advised.
    collection_name - the collection from which documents will be returned.
    """
    resp = requests.post(config()['re_api_url'] + '/api/v1/query_results',
                         data=json.dumps({
                             'query': 'FOR d in @@col RETURN d',
                             '@col': collection_name
                         }),
                         headers={'Authorization': config()['re_api_token']})
    if not resp.ok:
        raise RuntimeError(resp.text)
    return resp.json()
def wait_for_dependencies(elasticsearch=True, re_api=True, timeout=60):
    """
    Block and wait for elasticsearch and/or the relation engine API.
    elasticsearch - True (the default) to block on elasticsearch.
    re_api - True (the default) to block on the relation engine API.
    timeout - the maximum time to wait for all services to come up.
    """
    start = int(time.time())
    if elasticsearch:
        es_url = config()['elasticsearch_url'] + '/_cluster/health'
        params = {'wait_for_status': 'yellow', 'timeout': '60s'}
        _wait_for_service(es_url, 'elasticsearch', start, timeout, params=params)
    if re_api:
        _wait_for_service(config()['re_api_url'] + '/', 'relation engine api', start, timeout)
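# Usage sketch: wait only on Elasticsearch with a longer timeout.
def _example_wait():
    wait_for_dependencies(re_api=False, timeout=180)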