# Module-level imports assumed by the snippets in this section; the
# project-specific helpers (as_list, get_in, resolve_path, get_field_path,
# remove_empty, remove_none, distinct, get_identifier, get_generate_uri,
# dict_merge, update_in, ...) are assumed to be imported from the
# surrounding ETL package.
import base64
import itertools
import json
import os
import re
import signal
import sys
from functools import reduce
from typing import List, Tuple

from unqlite import UnQLite, UNQLITE_OPEN_MMAP, UNQLITE_OPEN_READONLY


def resolve_field_value_template(tree, data, data_index):
    # Resolve the object path(s) to a list of ids, then replace 'data' with
    # the objects found in the index
    object_paths = as_list(resolve_path(tree, ['start', 'object_path']))
    for object_path in object_paths:
        ids = as_list(resolve_path(data, get_field_path(object_path)))
        if not ids:
            return None
        data = remove_none(list(map(lambda id: data_index.get(id), ids)))

    value_paths = as_list(resolve_path(tree, ['start', 'value_path']))
    if len(value_paths) == 1:
        return resolve_path(data, get_field_path(value_paths[0]))
    else:
        new_value = []
        for data in as_list(data):
            field_values = remove_empty(
                map(
                    lambda value_path: as_list(
                        resolve_path(data, get_field_path(value_path))) or None,
                    value_paths))
            # Join every combination of field values into a single string
            product = itertools.product(*field_values)
            joined = map(
                lambda field_value: reduce(
                    lambda acc, s: s if s.startswith(acc) else acc + " " + s,
                    field_value, ""),
                product)
            if joined:
                new_value.extend(joined)
        return list(distinct(remove_empty(new_value)))
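# A small, self-contained sketch of the joining step above, with
# hypothetical field values: each combination from itertools.product is
# folded with reduce; a part that already starts with the accumulated
# string replaces it, otherwise it is appended with a space.
field_values = [["Wheat"], ["Wheat accession", "accession A"]]
joined = [
    reduce(lambda acc, s: s if s.startswith(acc) else acc + " " + s, parts, "")
    for parts in itertools.product(*field_values)
]
assert joined == ["Wheat accession", "Wheat accession A"]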
def link_object(dest_entity_name, dest_object, src_object_id):
    dest_object_ref = dest_entity_name + 'DbIds'
    dest_object_ids = dest_object.get(dest_object_ref) or set()
    if not isinstance(dest_object_ids, set):
        dest_object_ids = set(dest_object_ids)
    dest_object_ids.add(src_object_id)
    dest_object[dest_object_ref] = remove_empty(dest_object_ids)
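# Hypothetical usage of link_object: record that location 'loc1' is
# referenced by a second study. The '<entity>DbIds' field accumulates ids
# as a set (assuming remove_empty returns a non-empty set unchanged).
location = {'locationDbId': 'loc1', 'studyDbIds': ['study1']}
link_object('study', location, 'study2')
# location['studyDbIds'] == {'study1', 'study2'}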
def transform_parse_uri(source: dict, entities: dict, entity_line: Tuple[str, str]) -> List[dict]:
    """
    Parse JSON, get or generate ID, get or generate URI
    """
    entity, line = entity_line
    data = json.loads(line)
    output = []

    def get_or_generate_uri(source, entity, data):
        data_id = get_identifier(entity, data)
        data_uri = get_generate_uri(source, entity, data)
        return {'@type': entity, '@id': data_uri, 'schema:identifier': data_id}

    # Extract internal objects (if any)
    internal_object_links = filter(
        lambda l: l['type'] == 'internal-object',
        get_in(entities, [entity, 'links']) or [])
    for link in internal_object_links:
        link_entity = link['entity']
        link_path = remove_empty(link['json-path'].split('.'))
        link_values = get_in(data, link_path)
        for link_value in as_list(link_values):
            # Output internal object
            output.append(get_or_generate_uri(source, link_entity, link_value))

    # Output current data object
    output.append(get_or_generate_uri(source, entity, data))
    return output
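# Hypothetical 'entities' configuration consumed above: a study embeds its
# location as an internal object at the JSON path 'location', so both the
# study and its embedded location get a URI document.
entities_config_example = {
    'study': {
        'links': [
            {'type': 'internal-object', 'entity': 'location', 'json-path': 'location'},
        ],
    },
}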
def fetch_all_links(source, logger, entities):
    """
    Link objects across entities.
     - Internal: link an object (ex: study) to another using an identifier
       inside the JSON object (ex: link a location via study.locationDbId)
     - Internal object: link an object (ex: study) to another contained
       inside the first (ex: link a location via study.location.locationDbId)
     - External object: link an object (ex: study) to another using a
       dedicated call (ex: link to observation variables via
       /brapi/v1/studies/{id}/observationVariables)
    """
    for (entity_name, entity) in entities.items():
        if 'links' not in entity:
            continue
        for link in entity['links']:
            for (object_id, object) in entity['store'].items():
                linked_entity_name = link['entity']
                linked_entity = entities[linked_entity_name]
                linked_objects_by_id = {}

                if link['type'].startswith('internal'):
                    link_path = link['json-path']
                    link_path_list = remove_empty(link_path.split('.'))
                    link_values = remove_none(as_list(get_in(object, link_path_list)))
                    if not link_values:
                        if link.get('required'):
                            raise BrokenLink(
                                "Could not find required field '{}' in {} object id '{}'"
                                .format(link_path, entity_name, object_id))
                        continue

                    if link['type'] == 'internal-object':
                        for link_value in link_values:
                            link_id = get_identifier(linked_entity_name, link_value)
                            linked_objects_by_id[link_id] = link_value
                    elif link['type'] == 'internal':
                        link_id_field = linked_entity['name'] + 'DbId'
                        link_name_field = linked_entity['name'] + 'Name'
                        for link_value in link_values:
                            link_id = link_value.get(link_id_field)
                            link_name = link_value.get(link_name_field)
                            if link_id:
                                linked_objects_by_id[link_id] = {
                                    link_id_field: link_id,
                                    link_name_field: link_name}

                elif link['type'] == 'external-object':
                    call = get_implemented_call(source, link, context=object)
                    if not call:
                        continue
                    link_values = list(BreedingAPIIterator.fetch_all(
                        source['brapi:endpointUrl'], call, logger))
                    for link_value in link_values:
                        link_id = get_identifier(linked_entity_name, link_value)
                        linked_objects_by_id[link_id] = link_value

                link_objects(entity, object, linked_entity, linked_objects_by_id)
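# Hypothetical link configurations for the three types handled above (the
# exact 'json-path' and 'call' shapes are assumptions based on this code,
# not taken from the project config):
study_links_example = [
    # Internal: read locationDbId/locationName from the study itself
    # ('.' splits into empty segments, so the path resolves to the object)
    {'type': 'internal', 'entity': 'location', 'json-path': '.', 'required': True},
    # Internal object: the location is embedded in the study
    {'type': 'internal-object', 'entity': 'location', 'json-path': 'location'},
    # External object: linked objects are fetched with a dedicated BrAPI call
    {'type': 'external-object', 'entity': 'observationVariable',
     'call': {'path': 'studies/{studyDbId}/observationVariables'}},
]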
def add(self, data):
    # Compact object by removing nulls and empty data
    data = remove_empty(data)
    if data:
        data['source'] = self.source_id
        data_id = get_identifier(self.entity_name, data)
        if data_id in self:
            dict_merge(self[data_id], data)
        else:
            self[data_id] = data
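# Merge semantics assumed above: two partial fetches of the same study
# (same studyDbId) are deep-merged under one key via dict_merge, with the
# source id stamped on the record. Roughly:
# store.add({'studyDbId': 's1', 'studyName': 'Trial A'})
# store.add({'studyDbId': 's1', 'locationDbId': 'loc1'})
# store['s1'] == {'studyDbId': 's1', 'studyName': 'Trial A',
#                 'locationDbId': 'loc1', 'source': <source_id>}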
def fetch_all_in_store(entities, fetch_function, arguments, pool):
    """
    Run a fetch function with arguments in a pool worker and collect results
    in the entity MergeStore
    """
    results = remove_empty(pool.imap_unordered(fetch_function, arguments, 4))
    if not results:
        return
    for (entity_name, data_list) in results:
        for data in data_list:
            entities[entity_name]['store'].add(data)
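# Hypothetical invocation: fetch_details is a worker function returning
# (entity_name, data_list) tuples (or None, which remove_empty drops);
# entities and arguments are prepared by the caller.
from multiprocessing.pool import ThreadPool

with ThreadPool(4) as pool:
    fetch_all_in_store(entities, fetch_details, arguments, pool)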
def resolve_map_template(template, data, data_index):
    elements = template.get('{map}')
    transform = template.get('{to}')

    resolved = as_list(resolve(elements, data, data_index))
    if not isinstance(resolved, list):
        raise Exception("Map can only work on lists.")
    if not remove_empty(resolved):
        return None
    resolved = list(
        map(lambda value: resolve(transform, value, data_index), resolved))
    return resolved
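# Hypothetical '{map}' template consumed above: '{map}' selects the list to
# iterate and '{to}' is resolved once per element (the inner path syntax
# here is an assumption, not taken from this code).
map_template_example = {
    '{map}': '{.germplasmURIs}',
    '{to}': {'germplasmURI': '{.}'},
}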
def resolve_field_value_template(tree, data, data_index):
    # Variant of resolve_field_value_template above that looks objects up in
    # a per-entity index (possibly a persisted UnQLite store holding
    # JSON-encoded bytes)
    object_paths = as_list(get_in(tree, ['start', 'object_path']))
    for object_path in object_paths:
        field_path = get_field_path(object_path)
        ids = as_list(get_in(data, field_path))
        if not ids:
            return None
        field = field_path[-1]
        # Strip the 'URI'/'URIs' suffix from the field name to recover the
        # entity name
        entity = re.sub(r"(\w+)URIs?", "\\1", field)
        entity_index = data_index[entity]
        try:
            # On-disk index: values are JSON-encoded bytes
            data_list = []
            for id in ids:
                data_list.append(json.loads(entity_index[id].decode()))
            data = remove_none(data_list)
        except AttributeError:
            # In-memory index: values are plain dicts
            data = remove_none(list(map(lambda id: entity_index[id], ids)))
        if getattr(entity_index, 'close', False):
            entity_index.close()

    value_paths = as_list(get_in(tree, ['start', 'value_path']))
    if len(value_paths) == 1:
        return get_in(data, get_field_path(value_paths[0]))
    else:
        new_value = []
        for data in as_list(data):
            field_values = remove_empty(
                map(
                    lambda value_path: as_list(
                        get_in(data, get_field_path(value_path))) or None,
                    value_paths))
            # Join every combination of field values into a single string
            product = itertools.product(*field_values)
            joined = map(
                lambda field_value: reduce(
                    lambda acc, s: s if s.startswith(acc) else acc + " " + s,
                    field_value, ""),
                product)
            if joined:
                new_value.extend(joined)
        return list(distinct(remove_empty(new_value)))
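# The regex above recovers the entity name by stripping a trailing
# 'URI'/'URIs' suffix from the link field name:
assert re.sub(r"(\w+)URIs?", r"\1", "germplasmURIs") == "germplasm"
assert re.sub(r"(\w+)URIs?", r"\1", "locationURI") == "location"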
def fetch_all_in_store(entities, fetch_function, arguments, pool):
    """
    Run a fetch function with arguments in a pool worker and collect results
    in the entity MergeStore
    """
    # Variant of fetch_all_in_store above, with a source-specific patch
    source_name = arguments[0][0]['schema:identifier']
    results = remove_empty(pool.imap_unordered(fetch_function, arguments, 4))
    if not results:
        return
    for (entity_name, data_list) in results:
        for data in data_list:
            # WUR study 'startDate' is apparently a bare year; pad it into a
            # full ISO date before storing
            if source_name == 'WUR' and entity_name == 'study':
                data['startDate'] = data['startDate'] + "-01-01"
            entities[entity_name]['store'].add(data)
def remove_internal_objects(entities):
    """
    Remove objects referenced inside others (example: trial.studies or
    study.location)
    """
    for (entity_name, entity) in entities.items():
        for link in (entity.get('links') or []):
            if link['type'] != 'internal-object':
                continue
            for (_, data) in entity['store'].items():
                link_path = link['json-path']
                link_path_list = remove_empty(link_path.split('.'))
                context_path, last = link_path_list[:-1], link_path_list[-1]
                link_context = get_in(data, context_path)
                if link_context and last in link_context:
                    del link_context[last]
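# Before/after sketch for an 'internal-object' link on study.location:
# before: {'studyDbId': 's1', 'location': {'locationDbId': 'loc1'}}
# after remove_internal_objects(entities):
#         {'studyDbId': 's1'}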
def test_remove_empty(self):
    input_value = [None, [], "", {1, 2, None}, 0, {"a": {"b": {}}}]
    expected = [{1, 2}, 0]
    actual = remove_empty(input_value)
    self.assertEqual(expected, actual)
def transform_uri_link(source: dict, entities: dict, ignore_links,
                       id_index_files: dict, entity_line: Tuple[str, str]) -> dict:
    """
    Transform BrAPI data by adding URI links translated from DbId links and
    replacing DbIds with encoded URIs.
    Also checks entity links to make sure every referenced entity exists.
    """
    entity, line = entity_line
    data = remove_empty(json.loads(line))

    data_id = get_identifier(entity, data)
    data[f"{entity}DbId"] = str(data_id)
    data_uri = get_generate_uri(source, entity, data)
    data[f"{entity}URI"] = data_uri

    # Add basic JSON-LD fields (store URI as @id)
    data['@type'] = entity
    data['@id'] = data_uri

    # Add basic schema.org fields
    data['schema:includedInDataCatalog'] = source['@id']
    data['schema:identifier'] = data_id
    data['schema:name'] = data.get('schema:name') or data.get(entity + 'Name')

    # Create URI links for each DbId link
    id_links = get_entity_links(data, 'DbId')
    for linked_entity, link_path, link_value in id_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_uri_field = f"{linked_entity}URI{plural}"
        link_uri_path = [*link_path[:-1], link_uri_field]

        alias = None
        if linked_entity not in id_index_files:
            # Try to find an alias for the linked entity
            # (ex: parent1 in pedigree is a germplasm)
            aliases = map(
                lambda l: l['entity-alias'],
                filter(
                    # Find a link for current entity
                    lambda l: l['entity'] == linked_entity and 'entity-alias' in l,
                    # In entity links
                    get_in(entities, [data['@type'], 'links']) or []))
            alias = next(aliases, None)

        # Linked entity index by Id
        try:
            id_index_file = id_index_files[alias or linked_entity]
        except KeyError as e:
            raise MissingDataLink(
                f"No '{alias or linked_entity}' data available to verify '{link_path}' data link "
                f"in JSON object:\n"
                f"{data}\n"
                f"If you want to ignore the '{alias or linked_entity}' data links, add it to the "
                f"'ignore-links' config option.\n"
                f"If you want to extract the '{alias or linked_entity}' from '{data['@type']}', add an "
                f"'internal-object' link in the 'config/extract-brapi/entities/{data['@type']}' config file.\n"
                f"If the path '{link_path}' corresponds to another type of entity, add an 'internal' link "
                f"with an 'entity-alias' in the 'config/extract-brapi/entities/{data['@type']}' config file."
            ) from e

        # Open the index read-only
        uri_index = UnQLite(id_index_file, flags=UNQLITE_OPEN_READONLY | UNQLITE_OPEN_MMAP)

        def get_in_index(link_id):
            try:
                return uri_index[link_id].decode()
            except KeyError as e:
                raise MissingDataLink(
                    f"Could not find '{alias or linked_entity}' with id '{link_id}' "
                    f"found in '{link_path}' of object:\n{data}") from e

        if plural:
            link_uri = list(map(get_in_index, link_value))
        else:
            link_uri = get_in_index(link_value)
        update_in(data, link_uri_path, link_uri)

    def encode_uri(uri):
        return base64.b64encode(str(uri).encode()).decode()

    # Replace DbId with b64 encoded URI
    uri_links = get_entity_links(data, 'URI')
    for linked_entity, link_path, link_value in uri_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_id_field = f"{linked_entity}DbId{plural}"
        link_id_path = [*link_path[:-1], link_id_field]
        if plural:
            link_id = list(map(encode_uri, link_value))
        else:
            link_id = encode_uri(link_value)
        update_in(data, link_id_path, link_id)

    return data
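# Sketch of the DbId encoding above: the URI (hypothetical value) is
# base64-encoded so it can stand in for the original DbId while remaining
# reversible.
uri = "urn:example/study/s1"
encoded = base64.b64encode(str(uri).encode()).decode()
assert base64.b64decode(encoded).decode() == uri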
def launch_etl(options, config):
    def handler(*_):
        sys.exit(0)
    signal.signal(signal.SIGINT, handler)

    default_index_template = config['load-elasticsearch']['index-template']

    # Execute ETL actions based on CLI arguments:
    if 'extract' in options or 'etl_es' in options or 'etl_virtuoso' in options:
        etl.extract.brapi.main(config)

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']

        # Restrict the list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base JSON schema definitions into each document JSON schema
        validation_config = transform_config['validation']
        base_definitions = validation_config['base-definitions']
        for (document_type, document_schema) in validation_config['documents'].items():
            document_schema['definitions'] = base_definitions

        # Run transform
        etl.transform.elasticsearch.main(config)

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with absolute path
        for (entity_name, entity) in config['transform-jsonld']['entities'].items():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(
                            entity['@context'],
                            os.path.join(config['conf-dir'], 'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

        etl.transform.jsonld.main(config)

    if 'transform_rdf' in options or 'etl_virtuoso' in options:
        etl.transform.rdf.main(config)

    if 'load_elasticsearch' in options or 'etl_es' in options:
        mapping_files = list_entity_files(
            os.path.join(config['conf-dir'], 'elasticsearch'))

        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))

        config['load-elasticsearch']['url'] = '{}:{}'.format(
            options['host'], options['port'])
        config['load-elasticsearch']['mappings'] = {
            document_type: file_path
            for document_type, file_path in mapping_files
        }
        config['load-elasticsearch']['index-template'] = options.get(
            'index_template') or default_index_template
        config['load-elasticsearch']['document-types'] = selected_document_types

        etl.load.elasticsearch.main(config)

    if 'load_virtuoso' in options or 'etl_virtuoso' in options:
        etl.load.virtuoso.main(config)
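# Hypothetical CLI options dict as consumed above (keys follow the lookups
# in launch_etl; values are illustrative only):
options_example = {
    'etl_es': True,
    'document_types': 'germplasm,study',
    'host': 'localhost',
    'port': 9200,
    'index_template': None,
}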
def extend_config(config, options):
    """
    Extend the configuration with the options provided in CLI arguments
    """
    config['options'] = options

    # Data output dir
    config['data-dir'] = get_folder_path(
        [options.get('data_dir') or config['default-data-dir']], create=True)

    # Sources config
    config['sources'] = dict()
    source_id_field = 'schema:identifier'
    for source_file in (options.get('sources') or list()):
        source_config = json.loads(source_file.read())
        if source_id_field not in source_config:
            raise Exception(
                "No field '{}' in data source JSON configuration file '{}'"
                .format(source_id_field, source_file.name))
        identifier = source_config[source_id_field]
        if identifier in config['sources']:
            raise Exception(
                "Source id '{}' found twice in source list: {}\n"
                "Please verify the '{}' field in your files.".format(
                    identifier, options['sources'], source_id_field))
        config['sources'][identifier] = source_config

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']
        transform_config['documents'] = list(
            transform_config['documents'].values())

        # Restrict the list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base JSON schema definitions into each document JSON schema
        # (skipping the 'base-definitions' entry itself)
        validation_schemas = transform_config['validation-schemas']
        base_definitions = validation_schemas['base-definitions']
        for (document_type, document_schema) in validation_schemas.items():
            if document_schema != base_definitions:
                document_schema['definitions'] = base_definitions

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with absolute path
        for (entity_name, entity) in config['transform-jsonld']['entities'].items():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(
                            entity['@context'],
                            os.path.join(config['conf-dir'], 'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

    if 'load_elasticsearch' in options or 'etl_es' in options:
        load_elasticsearch = config['load-elasticsearch']

        # CLI selected list of document types
        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))
        load_elasticsearch['document-types'] = selected_document_types

        elasticsearch_config = load_elasticsearch['config']
        load_elasticsearch['index-template'] = options.get(
            'index_template') or elasticsearch_config['index-template']
        load_elasticsearch['url'] = '{}:{}'.format(
            options['host'] or elasticsearch_config['host'],
            options['port'] or elasticsearch_config['port'])

    return config
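# Hypothetical data-source JSON configuration, as loaded above (the
# 'schema:identifier', '@id' and 'brapi:endpointUrl' keys all appear in
# this section; the values are made up):
source_config_example = {
    'schema:identifier': 'EXAMPLE-SOURCE',
    '@id': 'https://example.org',
    'brapi:endpointUrl': 'https://example.org/brapi/v1/',
}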