def default_fields(obj_data, ws_info, obj_data_v1):
    """
    Produce data for the fields that are present in every workspace object
    document in Elasticsearch.
    """
    ws_id = obj_data['info'][6]
    obj_id = obj_data['info'][0]
    version = obj_data['info'][4]
    v1_info = obj_data_v1['info']
    is_public = ws_info[6] == 'r'
    shared_users = get_shared_users(ws_id)
    copy_ref = obj_data.get('copied')
    obj_type = obj_data['info'][2]
    (type_module, type_name, type_version) = get_type_pieces(obj_type)
    tags = _get_tags(ws_info)
    return {
        "creator": obj_data["creator"],
        "access_group": ws_id,
        "obj_name": obj_data['info'][1],
        "shared_users": shared_users,
        "timestamp": obj_data['epoch'],
        "creation_date": v1_info[3],
        "is_public": is_public,
        "version": version,
        "obj_id": obj_id,
        "copied": copy_ref,
        "tags": tags,
        "obj_type_version": type_version,
        "obj_type_module": type_module,
        "obj_type_name": type_name
    }
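# For reference, the positional indices used above follow the workspace
# service's object_info and workspace_info tuples. This mapping is a sketch
# derived from the reads in this module; consult the workspace service spec
# for the authoritative layout:
#
#   obj_data['info'][0] -> object id      obj_data['info'][1] -> object name
#   obj_data['info'][2] -> type string    obj_data['info'][4] -> version
#   obj_data['info'][6] -> workspace id
#   ws_info[6]  -> global read permission ('r' means publicly readable)
#   ws_info[-1] -> workspace metadata dict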
def index_obj(obj_data, ws_info, msg_data):
    """
    For a newly created object, generate the index document for it and push
    it to the Elasticsearch topic on Kafka.
    Args:
        obj_data - in-memory parsed data from the workspace object
        ws_info - in-memory parsed workspace info for the object's workspace
        msg_data - json event data received from the kafka workspace events
            stream. Must have keys for `wsid` and `objid`
    """
    obj_type = obj_data['info'][2]
    (type_module, type_name, type_version) = ws_utils.get_type_pieces(obj_type)
    if (type_module + '.' + type_name) in _TYPE_BLACKLIST:
        # Blacklisted type, so we don't index it
        return
    # If the workspace's metadata has the "noindex" search tag, skip the object
    metadata = ws_info[-1]
    if metadata.get('searchtags'):
        if 'noindex' in metadata['searchtags']:
            return
    # Fetch the info of the first version to get the object's creation date.
    upa = get_upa_from_msg_data(msg_data)
    try:
        obj_data_v1 = ws_client.admin_req('getObjects', {
            'objects': [{'ref': upa + '/1'}],
            'no_data': 1
        })
    except WorkspaceResponseError as err:
        logger.error('Workspace response error: %s', err.resp_data)
        raise
    obj_data_v1 = obj_data_v1['data'][0]
    # Dispatch to a specific type handler to produce the search document.
    indexer = _find_indexer(type_module, type_name, type_version)
    # All indexers are generators that yield document data for ES.
    defaults = indexer_utils.default_fields(obj_data, ws_info, obj_data_v1)
    for indexer_ret in indexer(obj_data, ws_info, obj_data_v1):
        if indexer_ret['_action'] == 'index':
            if config()['allow_indices'] and indexer_ret.get('index') not in config()['allow_indices']:
                # This index name is not in the indexing whitelist from the config, so we skip it.
                logger.debug(f"Index '{indexer_ret['index']}' is not in ALLOW_INDICES, skipping")
                continue
            if indexer_ret.get('index') in config()['skip_indices']:
                # This index name is in the indexing blacklist from the config, so we skip it.
                logger.debug(f"Index '{indexer_ret['index']}' is in SKIP_INDICES, skipping")
                continue
            if not indexer_ret.get('no_defaults'):
                # Inject all default fields into the index document.
                indexer_ret['doc'].update(defaults)
        yield indexer_ret
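# A minimal usage sketch, not from the source: drive index_obj from a parsed
# workspace event and forward every yielded message to the Elasticsearch
# writer. The `producer` object, the 'getWorkspaceInfo' admin command, and
# the 'elasticsearch_updates' topic key are assumptions for illustration.
def _sketch_handle_new_object(msg_data, producer):
    upa = get_upa_from_msg_data(msg_data)
    # Fetch the full object and its workspace info (hypothetical wiring)
    obj_data = ws_client.admin_req('getObjects', {
        'objects': [{'ref': upa}]
    })['data'][0]
    ws_info = ws_client.admin_req('getWorkspaceInfo', {'id': msg_data['wsid']})
    for es_msg in index_obj(obj_data, ws_info, msg_data):
        producer.produce(config()['topics']['elasticsearch_updates'], json.dumps(es_msg))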
def _init_generic_index(msg):
    """
    Initialize an index from a workspace object indexed by the generic
    indexer. For example, when the generic indexer gets a type like
    Module.Type-4.0, then we create an index called "search2.type_0".
    Message fields:
        full_type_name - string - eg. "Module.Type-X.Y"
    """
    (_, type_name, _type_ver) = get_type_pieces(msg['full_type_name'])
    index_name = type_name.lower() + '_0'
    mappings = {**_GLOBAL_MAPPINGS['ws_auth'], **_GLOBAL_MAPPINGS['ws_object']}
    _init_index(index_name, mappings)
    # Update the 'default_search' alias to include this index
    _create_alias(_DEFAULT_SEARCH_ALIAS, f"{_PREFIX}.{index_name}")
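# Worked example (hypothetical message, assuming _PREFIX == 'search2'): a
# generic-indexed type "Module.Type-4.0" produces the unversioned index
# "type_0" and registers it under the default-search alias.
#
#   _init_generic_index({'full_type_name': 'Module.Type-4.0'})
#   # -> _init_index('type_0', mappings)
#   # -> _create_alias(_DEFAULT_SEARCH_ALIAS, 'search2.type_0')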
def index_from_sdk(obj_data, ws_info, obj_data_v1, conf):
    """Index an object using an SDK indexer application."""
    type_module, type_name, type_version = ws_utils.get_type_pieces(obj_data['info'][2])
    indexer_app_vars = config()['global']['sdk_indexer_apps'][type_module + '.' + type_name]
    sdk_app = indexer_app_vars['sdk_app']
    sdk_func = indexer_app_vars['sdk_func']
    sdk_version = indexer_app_vars.get('sdk_version')
    sub_obj_index = _get_sub_obj_index(indexer_app_vars)
    workspace_id = obj_data['info'][6]
    object_id = obj_data['info'][0]
    index_name_ver = _get_index_name(type_module, type_name, type_version)
    image = _get_docker_image_name(sdk_app, sdk_version)
    _pull_docker_image(image)
    job_dir = os.path.join(_SCRATCH, str(uuid.uuid1()))
    os.makedirs(job_dir)
    _setup_docker_inputs(job_dir, obj_data, ws_info, obj_data_v1, sdk_app, sdk_func)
    # The volume mount must be relative to the host, so we prepend _MOUNT_DIR to the job directory.
    vols = {_MOUNT_DIR + job_dir: {'bind': _IN_APP_JOB_DIR, 'mode': 'rw'}}
    env = {
        'SDK_CALLBACK_URL': 'not_supported_yet',
        'KBASE_ENDPOINT': config()['kbase_endpoint']
    }
    # Run the docker container.
    _DOCKER.containers.run(image, 'async', environment=env, volumes=vols)
    with open(job_dir + "/output.json") as fd:
        job_out = json.load(fd)
    if job_out.get('error'):
        raise RuntimeError(f"Error from SDK application: {job_out['error']}")
    job_out = job_out['result'][0]
    if job_out.get('filepath'):
        filepath = job_out['filepath'].replace(_IN_APP_JOB_DIR, job_dir, 1)
    else:
        raise RuntimeError(f"Unknown SDK application error: {job_out}")
    return _verify_and_format_output(filepath, job_dir, workspace_id, object_id,
                                     index_name_ver, sub_obj_index)
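# For reference, a sketch of the output.json contract the code above expects
# from the SDK container; the exact document payload behind 'filepath' is
# produced by the SDK app itself:
#
#   {"error": null, "result": [{"filepath": "<_IN_APP_JOB_DIR>/..."}]}
#
# On success, the in-container filepath is rewritten to the host-side job_dir
# before being handed to _verify_and_format_output.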
def fn(obj_data, ws_info, obj_data_v1):
    workspace_id = obj_data['info'][6]
    object_id = obj_data['info'][0]
    obj_type = obj_data['info'][2]
    # Send an event to the elasticsearch_writer to initialize an index for
    # this type, if it does not exist.
    yield {'_action': 'init_generic_index', 'full_type_name': obj_type}
    obj_type_name = ws_utils.get_type_pieces(obj_type)[1]
    yield {
        '_action': 'index',
        'doc': indexer_utils.default_fields(obj_data, ws_info, obj_data_v1),
        'index': obj_type_name.lower() + "_0",
        'id': f"WS::{workspace_id}:{object_id}",
        # The doc already contains the default fields, so tell index_obj not
        # to inject them again.
        'no_defaults': True,
        # 'namespace': "WS"
    }
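# A sketch of the two messages this generator yields for a hypothetical
# object 7 in workspace 42 of type "Module.Type-4.0":
#
#   {'_action': 'init_generic_index', 'full_type_name': 'Module.Type-4.0'}
#   {'_action': 'index', 'doc': {...default fields...}, 'index': 'type_0',
#    'id': 'WS::42:7', 'no_defaults': True}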
def _save_type_vertices(obj_info):
    """Save the associated vertices for an object type."""
    obj_type = sanitize_arangodb_key(obj_info[2])
    (type_module, type_name, type_ver) = get_type_pieces(obj_type)
    (maj_ver, min_ver) = [int(v) for v in type_ver.split('.')]
    logger.info(f'Saving ws_type_version, ws_type, and ws_type_module for {obj_type}')
    save('ws_type_version', {
        '_key': obj_type,
        'type_name': type_name,
        'module_name': type_module,
        'maj_ver': maj_ver,
        'min_ver': min_ver
    })
    save('ws_type', {
        '_key': f'{type_module}.{type_name}',
        'type_name': type_name,
        'module_name': type_module
    })
    save('ws_type_module', {'_key': type_module})
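# A sketch of the vertices saved for a hypothetical type
# "KBaseGenomes.Genome-8.1":
#
#   ws_type_version: {'_key': 'KBaseGenomes.Genome-8.1', 'type_name': 'Genome',
#                     'module_name': 'KBaseGenomes', 'maj_ver': 8, 'min_ver': 1}
#   ws_type:         {'_key': 'KBaseGenomes.Genome', 'type_name': 'Genome',
#                     'module_name': 'KBaseGenomes'}
#   ws_type_module:  {'_key': 'KBaseGenomes'}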