def transform_parse_uri(source: dict, entities: dict, entity_line: Tuple[str, str]) -> List[dict]:
    """
    Parse one JSON line and describe the identity (type, URI, identifier) of
    the parsed object and of every object embedded in it through an
    'internal-object' link.

    :param source: data source description, forwarded to ``get_generate_uri``
    :param entities: entity configurations by entity name; only the 'links'
        list of the current entity is read here
    :param entity_line: ``(entity name, JSON text)`` pair
    :return: one ``{'@type', '@id', 'schema:identifier'}`` dict per embedded
        object (in link order), followed by the one for the parsed object
    """
    entity, line = entity_line
    data = json.loads(line)

    def describe(object_entity, object_data):
        # get_identifier runs before get_generate_uri.
        # NOTE(review): the URI generation may rely on it - confirm before reordering.
        identifier = get_identifier(object_entity, object_data)
        uri = get_generate_uri(source, object_entity, object_data)
        return {'@type': object_entity, '@id': uri, 'schema:identifier': identifier}

    records = []

    # Embedded objects first (if any)
    for link in get_in(entities, [entity, 'links']) or []:
        if link['type'] != 'internal-object':
            continue
        embedded_entity = link['entity']
        embedded_path = remove_empty(link['json-path'].split('.'))
        for embedded in as_list(get_in(data, embedded_path)):
            records.append(describe(embedded_entity, embedded))

    # Then the parsed object itself
    records.append(describe(entity, data))
    return records
def fetch_details(options):
    """
    Fetch details call for a BrAPI object (ex: /brapi/v1/studies/{id})

    :param options: ``(source, logger, entity, object_id)`` tuple
    :return: ``(entity name, [details])``; None when the entity has no 'detail'
        call group, when the stored object can be skipped, or when no
        implemented detail call is found
    """
    source, logger, entity, object_id = options
    if 'detail' not in entity:
        return None

    call_group = entity['detail']
    store = entity['store']
    is_stored = object_id in store
    skip_stored = call_group.get('skip-if-in-store')
    has_details = get_in(store, [object_id, 'etl:detailed'])
    if is_stored and (skip_stored or has_details):
        return None

    name = entity['name']
    call = get_implemented_call(source, call_group, {name + 'DbId': object_id})
    if not call:
        return None

    # Only the first item produced by the iterator is used
    responses = BreedingAPIIterator.fetch_all(source['brapi:endpointUrl'], call, logger)
    details = next(responses)
    details['etl:detailed'] = True
    return name, [details]
def test_get_in_simple_path(self):
    # A path given as a single string key yields the value stored under that key.
    document = {"a": 1}
    self.assertEqual(1, get_in(document, "a"))
def test_get_in_invalid_simple_path(self):
    # "z" does not exist under "a": the lookup is expected to yield None.
    document = {"a": {"b": {"c": 1}}}
    result = get_in(document, ["a", "z"])
    self.assertEqual(None, result)
def test_get_in_heterogeneous_nested_multi_value_path(self):
    # Lists may appear at any level of the path; every reachable "c" value is
    # expected, flattened, in document order.
    documents = [
        {"a": {"b": {"c": 1}}},
        {"a": [{"b": {"c": 3}}]},
        {"a": {"b": [{"c": 4}, {"c": 5}]}},
    ]
    result = get_in(documents, ["a", "b", "c"])
    self.assertEqual([1, 3, 4, 5], result)
def test_get_in_nested_multi_value_path(self):
    # The same nested path applied to a list of objects yields one value per object.
    documents = [{"a": {"b": {"c": value}}} for value in (1, 2, 3)]
    result = get_in(documents, ["a", "b", "c"])
    self.assertEqual([1, 2, 3], result)
def test_get_in_nested_path(self):
    # A list path walks through nested dicts down to the leaf value.
    document = {"a": {"b": {"c": 1}}}
    result = get_in(document, ["a", "b", "c"])
    self.assertEqual(1, result)
def test_get_in_simple_multi_value_path(self):
    # A single string key applied to a list of objects yields one value per object.
    documents = [{"a": 1}, {"a": 2}, {"a": 3}]
    self.assertEqual([1, 2, 3], get_in(documents, "a"))
def fetch_all_links(source, logger, entities):
    """
    Link objects across entities.

    Three kinds of link are handled, selected by ``link['type']``:

    - 'internal': link an object (ex: study) to another using an identifier
      inside the JSON object (ex: link a location via study.locationDbId)
    - 'internal-object': link an object (ex: study) to another contained
      inside the first (ex: link a location via study.location.locationDbId)
    - 'external-object': link an object (ex: study) to another using a
      dedicated call (ex: link to observation variables via
      /brapi/v1/studies/{id}/observationVariables)

    :param source: data source description ('brapi:endpointUrl' is read for
        external calls)
    :param logger: logger forwarded to ``BreedingAPIIterator.fetch_all``
    :param entities: entity configurations by name, each with a 'store' and
        optionally a 'links' list
    :raises BrokenLink: when a link flagged 'required' has no value in an object
    """
    for entity_name, entity in entities.items():
        if 'links' not in entity:
            continue

        for link in entity['links']:
            for object_id, data_object in entity['store'].items():
                target_name = link['entity']
                target_entity = entities[target_name]
                link_type = link['type']
                targets_by_id = {}

                if link_type.startswith('internal'):
                    json_path = link['json-path']
                    path_parts = remove_empty(json_path.split('.'))
                    found = remove_none(as_list(get_in(data_object, path_parts)))

                    if not found:
                        if link.get('required'):
                            raise BrokenLink("Could not find required field '{}' in {} object id '{}'"
                                             .format(json_path, entity_name, object_id))
                        # Nothing to link for this object
                        continue

                    if link_type == 'internal-object':
                        # The embedded object itself is the linked object
                        for value in found:
                            value_id = get_identifier(target_name, value)
                            targets_by_id[value_id] = value
                    elif link_type == 'internal':
                        # Only the id and name fields are kept for the linked object
                        id_field = target_entity['name'] + 'DbId'
                        name_field = target_entity['name'] + 'Name'
                        for value in found:
                            value_id = value.get(id_field)
                            value_name = value.get(name_field)
                            if value_id:
                                targets_by_id[value_id] = {id_field: value_id, name_field: value_name}

                elif link_type == 'external-object':
                    call = get_implemented_call(source, link, context=data_object)
                    if not call:
                        continue

                    # All results are fetched before any of them is indexed
                    fetched = list(BreedingAPIIterator.fetch_all(source['brapi:endpointUrl'], call, logger))
                    for value in fetched:
                        value_id = get_identifier(target_name, value)
                        targets_by_id[value_id] = value

                link_objects(entity, data_object, target_entity, targets_by_id)
def resolve_field_value_template(tree, data, data_index):
    """
    Resolve a field value template against ``data``.

    For each object path under ``tree['start']['object_path']``, the ids found
    in ``data`` at that path are replaced by the objects read from the matching
    entity index, which become the new ``data``. The value path(s) under
    ``tree['start']['value_path']`` are then read from the result.

    :param tree: template tree; 'start' -> 'object_path' / 'value_path' are
        read, each resolved to a field path by ``get_field_path``
    :param data: object (or list of objects) the template is resolved against
    :param data_index: entity indices by entity name; an index maps an id
        either to JSON-encoded bytes or directly to an object
    :return: None when an object path yields no id; the ``get_in`` result when
        there is exactly one value path; otherwise the list of distinct,
        non-empty strings built by joining one value per value path
    """
    for object_path in as_list(get_in(tree, ['start', 'object_path'])):
        field_path = get_field_path(object_path)
        ids = as_list(get_in(data, field_path))
        if not ids:
            return None

        # The entity name is the last path segment without its 'URI'/'URIs' suffix
        entity = re.sub(r"(\w+)URIs?", "\\1", field_path[-1])
        entity_index = data_index[entity]
        try:
            try:
                # Index storing JSON-encoded bytes
                data = remove_none([json.loads(entity_index[linked_id].decode()) for linked_id in ids])
            except AttributeError:
                # Values without a ``decode`` method are used as they are
                data = remove_none([entity_index[linked_id] for linked_id in ids])
        finally:
            # Release the index even when a lookup fails
            if getattr(entity_index, 'close', False):
                entity_index.close()

    value_paths = as_list(get_in(tree, ['start', 'value_path']))
    if len(value_paths) == 1:
        return get_in(data, get_field_path(value_paths[0]))

    def join_parts(parts):
        # A part that starts with the text accumulated so far replaces it;
        # any other part is appended after a space.
        return reduce(lambda acc, s: s if s.startswith(acc) else acc + " " + s, parts, "")

    new_value = []
    for item in as_list(data):
        field_values = remove_empty(
            map(lambda value_path: as_list(get_in(item, get_field_path(value_path))) or None,
                value_paths))
        # One joined string per combination of values across the value paths
        new_value.extend(map(join_parts, itertools.product(*field_values)))
    return list(distinct(remove_empty(new_value)))
def remove_internal_objects(entities):
    """
    Delete, in place, the objects embedded inside stored objects through
    'internal-object' links (example: trial.studies or study.location)

    :param entities: entity configurations by name; 'links' (optional) and
        'store' are read, and stored objects are modified in place
    """
    for _entity_name, entity in entities.items():
        internal_object_links = (
            link for link in (entity.get('links') or [])
            if link['type'] == 'internal-object'
        )
        for link in internal_object_links:
            for _object_id, stored in entity['store'].items():
                path = remove_empty(link['json-path'].split('.'))
                # The last segment is the key to delete, the rest leads to its container
                parent_path, leaf = path[:-1], path[-1]
                holder = get_in(stored, parent_path)
                if not holder or leaf not in holder:
                    continue
                del holder[leaf]
def test_get_in_invalid_multi_path(self):
    # None of the objects has a "c" key directly under "a": the lookup is
    # expected to yield None rather than a list.
    documents = [
        {"a": {"b": {"c": 1}}},
        {"a": {"b": [{"c": 4}, {"c": 5}]}},
    ]
    result = get_in(documents, ["a", "c", "d"])
    self.assertEqual(None, result)
def fetch_details(options):
    """
    Fetch details call for a BrAPI object (ex: /brapi/v1/studies/{id})

    When the response wraps its content in a 'data' list holding exactly one
    element, that element is returned instead of the wrapper.

    :param options: ``(source, logger, entity, object_id)`` tuple
    :return: ``(entity name, [details])`` with ``details['etl:detailed']`` set
        to True; None when the entity has no 'detail' call group, when the
        stored object can be skipped, or when no implemented call is found
    :raises EndPointError: when the call group sets 'expect-single-result' and
        the response 'data' list does not hold exactly one element
    """
    source, logger, entity, object_id = options
    if 'detail' not in entity:
        return None

    detail_call_group = entity['detail']
    in_store = object_id in entity['store']
    skip_if_in_store = detail_call_group.get('skip-if-in-store')
    already_detailed = get_in(entity['store'], [object_id, 'etl:detailed'])
    if in_store and (skip_if_in_store or already_detailed):
        return None

    entity_name = entity['name']
    entity_id = entity_name + 'DbId'
    detail_call = get_implemented_call(source, detail_call_group, {entity_id: object_id})
    if not detail_call:
        return None

    # Only the first item produced by the iterator is used
    details = next(BreedingAPIIterator.fetch_all(source['brapi:endpointUrl'], detail_call, logger))

    # Detect buggy endpoints that return several studies instead of one.
    # The check also fires when the 'data' list is empty.
    has_data = 'data' in details
    if detail_call_group.get("expect-single-result") and has_data and len(details['data']) != 1:
        logger.debug(f"More than one results for {detail_call}")
        raise EndPointError(f"More than one results for {detail_call}")

    if has_data and len(details['data']) == 1:
        details = details['data'][0]

    # Flag the object that is actually returned: flagging the wrapper before
    # unwrapping would lose the marker read back as 'already_detailed' above.
    details['etl:detailed'] = True
    return entity_name, [details]
def transform_uri_link(source: dict, entities: dict, ignore_links, id_index_files: dict,
                       entity_line: Tuple[str, str]) -> dict:
    """
    Transform BrAPI data by adding URI links translated from DbId links and
    replacing DbIds with encoded URIs.
    Also checks entity links to make sure every referenced entity exists.

    :param source: data source description ('@id' is read, and the whole dict
        is forwarded to ``get_generate_uri``)
    :param entities: entity configurations by name; the 'links' of the current
        entity are searched for an 'entity-alias'
    :param ignore_links: collection of linked entity names to leave untouched
    :param id_index_files: UnQLite index file by entity name (id -> URI)
    :param entity_line: ``(entity name, JSON text)`` pair
    :return: the transformed JSON object
    :raises MissingDataLink: when no index exists for a linked entity, or when
        a linked id is absent from its index
    """
    entity, line = entity_line
    data = remove_empty(json.loads(line))

    data_id = get_identifier(entity, data)
    data[f"{entity}DbId"] = str(data_id)

    data_uri = get_generate_uri(source, entity, data)
    data[f"{entity}URI"] = data_uri

    # Add basic JSON-LD fields (store URI as @id)
    data['@type'] = entity
    data['@id'] = data_uri

    # Add basic schema.org fields
    data['schema:includedInDataCatalog'] = source['@id']
    data['schema:identifier'] = data_id
    data['schema:name'] = data.get('schema:name') or data.get(entity + 'Name')

    # Create URI links for each DbId link
    id_links = get_entity_links(data, 'DbId')
    for linked_entity, link_path, link_value in id_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_uri_field = f"{linked_entity}URI{plural}"
        link_uri_path = [*link_path[:-1], link_uri_field]

        alias = None
        if linked_entity not in id_index_files:
            # Try to find an alias for the linked entity (ex: parent1 in pedigree is a germplasm)
            entity_links = get_in(entities, [data['@type'], 'links']) or []
            alias = next(
                (link['entity-alias'] for link in entity_links
                 if link['entity'] == linked_entity and 'entity-alias' in link),
                None)

        # Linked entity index by Id
        try:
            id_index_file = id_index_files[alias or linked_entity]
        except KeyError as e:
            raise MissingDataLink(
                f"No '{alias or linked_entity}' data available to verify '{link_path}' data link "
                f"in JSON object:\n"
                f"{data}\n"
                f"If you want to ignore the '{alias or linked_entity}' data links add it to the 'ignore-links' "
                f"config option.\n"
                f"If you want to extract the '{alias or linked_entity}' from '{data['@type']}', add an "
                f"'internal-object' link in the 'config/extract-brapi/entities/{data['@type']}' config file.\n"
                f"If the path '{link_path}' corresponds to another type of entity, add an 'internal' link "
                f"with a 'entity-alias' in the 'config/extract-brapi/entities/{data['@type']}' config file."
            ) from e

        # open read only
        uri_index = UnQLite(id_index_file, flags=UNQLITE_OPEN_READONLY | UNQLITE_OPEN_MMAP)

        def get_in_index(link_id):
            try:
                return uri_index[link_id].decode()
            except KeyError as e:
                raise MissingDataLink(
                    f"Could not find '{alias or linked_entity}' with id '{link_id}' "
                    f"found in '{link_path}' of object:\n{data}") from e

        try:
            # Lookups are fully materialised here, so the index can be closed right after
            if plural:
                link_uri = list(map(get_in_index, link_value))
            else:
                link_uri = get_in_index(link_value)
        finally:
            # One handle is opened per link: release it even when a lookup fails
            uri_index.close()
        update_in(data, link_uri_path, link_uri)

    def encode_uri(uri):
        return base64.b64encode(str(uri).encode()).decode()

    # Replace DbId with b64 encoded URI
    uri_links = get_entity_links(data, 'URI')
    for linked_entity, link_path, link_value in uri_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_id_field = f"{linked_entity}DbId{plural}"
        link_id_path = [*link_path[:-1], link_id_field]
        if plural:
            link_id = list(map(encode_uri, link_value))
        else:
            link_id = encode_uri(link_value)
        update_in(data, link_id_path, link_id)

    return data
def get_field_path(tree):
    """
    Return what ``get_in`` finds in ``tree`` under 'field_path' -> 'FIELD'.
    """
    field_path_keys = ['field_path', 'FIELD']
    return get_in(tree, field_path_keys)