def fetch_all_links(source, logger, entities):
    """
    Resolve links between objects of the extracted entities.

    Three link types are handled (selected by each link config's 'type'):
    - 'internal': the object carries the target's DbId/Name fields at a
      JSON path (ex: link a location via study.locationDbId)
    - 'internal-object': the object embeds the target object itself at a
      JSON path (ex: link a location via study.location.locationDbId)
    - 'external-object': the targets are fetched through a dedicated BrAPI
      call (ex: /brapi/v1/studies/{id}/observationVariables)
    """
    for entity_name, entity in entities.items():
        if 'links' not in entity:
            continue
        for link in entity['links']:
            for data_id, data in entity['store'].items():
                link_type = link['type']
                target_name = link['entity']
                target_entity = entities[target_name]
                targets_by_id = {}
                if link_type.startswith('internal'):
                    json_path = link['json-path']
                    path_elements = remove_empty(json_path.split('.'))
                    values = remove_none(as_list(get_in(data, path_elements)))
                    if not values:
                        if link.get('required'):
                            raise BrokenLink("Could not find required field '{}' in {} object id '{}'"
                                             .format(json_path, entity_name, data_id))
                        continue
                    if link_type == 'internal-object':
                        # Linked objects are fully embedded in the data
                        for value in values:
                            targets_by_id[get_identifier(target_name, value)] = value
                    elif link_type == 'internal':
                        # Only DbId (and possibly Name) of the linked object is present
                        id_field = target_entity['name'] + 'DbId'
                        name_field = target_entity['name'] + 'Name'
                        for value in values:
                            value_id = value.get(id_field)
                            value_name = value.get(name_field)
                            if value_id:
                                targets_by_id[value_id] = {id_field: value_id,
                                                          name_field: value_name}
                elif link_type == 'external-object':
                    # Linked objects must be fetched with a separate BrAPI call
                    call = get_implemented_call(source, link, context=data)
                    if not call:
                        continue
                    fetched = list(BreedingAPIIterator.fetch_all(source['brapi:endpointUrl'], call, logger))
                    for value in fetched:
                        targets_by_id[get_identifier(target_name, value)] = value
                link_objects(entity, data, target_entity, targets_by_id)
def test_get_generate_identifier(self):
    """Generated identifier is stable and independent of dict key order."""
    first = get_identifier('buzz', {'foo': 'bar', 'baz': 'fizz'})
    self.assertEqual('148068838', first)
    # Same content inserted in a different order must yield the same id
    second = get_identifier('buzz', {'baz': 'fizz', 'foo': 'bar'})
    self.assertEqual(first, second)
def load_all_data_with_uri(source, source_json_dir, transform_config, pool, logger):
    """
    Load all BrAPI JSON objects for a source, generate a URI for each one,
    then return a parallel iterator over the objects with their DbId links
    replaced by global ids.
    """
    logger.debug(f"Loading BrAPI JSON from {source_json_dir}...")
    entity_files = list(list_entity_files(source_json_dir))
    if transform_config.get('restricted-documents'):
        # Only load the entities required by the configured documents
        required = get_required_entities(transform_config['documents'], source_json_dir)
        entity_files = [pair for pair in entity_files if first(pair) in required]
    logger.debug(f"Loading entities: {', '.join(map(first, entity_files))}")

    # Stream of raw JSON lines over every entity file
    line_stream = itertools.chain.from_iterable(map(load_entity_lines, entity_files))
    # Parse JSON into (entity_name, dict) pairs in the worker pool
    parsed = pool.imap_unordered(parse_data, line_stream, CHUNK_SIZE)

    # Generate URIs and index them by entity/id (both declared and computed ids)
    uri_map = {}
    loaded = []
    for entity_name, data in parsed:
        data_id, data_uri = generate_uri_global_id(source, entity_name, data)
        uri_map[(entity_name, data_id)] = data_uri
        uri_map[(entity_name, get_identifier(entity_name, data))] = data_uri
        if is_checkpoint(len(loaded)):
            logger.debug(f"checkpoint: {len(loaded)} BrAPI objects loaded")
        loaded.append(data)
    logger.debug(f"Loaded total of {len(loaded)} BrAPI objects.")

    # Replace all entity links using global ids (ex: studyDbId: 1 => studyDbId: urn:source%2Fstudy%2F1)
    link_replacer = partial(generate_global_id_links, source, uri_map)
    return pool.imap_unordered(link_replacer, loaded, CHUNK_SIZE)
def get_generate_uri(source: dict, entity: str, data: dict) -> str:
    """
    Get the URI of a BrAPI object from its PUI field, or generate one.

    Returns the PUI unchanged when it already is a valid URI. Otherwise a
    URN is built from the quoted source identifier plus either the quoted
    original PUI, or the quoted entity name and object identifier.

    Raises Exception when the resulting URI is still malformed.
    """
    pui_field = entity + 'PUI'
    data_uri = data.get(pui_field)
    if data_uri and rfc3987.match(data_uri, rule='URI'):
        # The original PUI is a valid URI: keep it as-is
        return data_uri

    source_id = urllib.parse.quote(source['schema:identifier'])
    data_id = get_identifier(entity, data)
    if data_uri:
        # PUI exists but is not a valid URI: prepend the source identifier
        data_uri = f"urn:{source_id}/{urllib.parse.quote(data_uri)}"
    else:
        # No PUI: build a URI from source id, entity name and data id
        data_uri = f"urn:{source_id}/{urllib.parse.quote(entity)}/{urllib.parse.quote(data_id)}"

    if not rfc3987.match(data_uri, rule='URI'):
        raise Exception(
            f'Could not get or create a correct URI for "{entity}" object id "{data_id}"'
            f' (malformed URI: "{data_uri}")')
    return data_uri
def fetch_all_details(source, logger, entities, pool):
    """Fetch the details of every stored object of every entity, in parallel."""
    # One (source, logger, entity, object_id) task per stored object
    args = [
        (source, logger, entity, get_identifier(entity_name, stored))
        for entity_name, entity in entities.items()
        for stored in entity['store'].values()
    ]
    fetch_all_in_store(entities, fetch_details, args, pool)
def add(self, data):
    """Add a BrAPI object to the store, merging it into any object already stored under the same identifier."""
    # Compact object by removing nulls and empty values
    data = remove_empty(data)
    if not data:
        return
    data['source'] = self.source_id
    data_id = get_identifier(self.entity_name, data)
    if data_id in self:
        # Already seen: merge new fields into the stored object
        dict_merge(self[data_id], data)
    else:
        self[data_id] = data
def generate_uri_global_id(source, entity_name, data):
    """
    Annotate a BrAPI object in place with its URI and URI-encoded global id.

    Sets JSON-LD fields (@type, @id), provenance fields (brapi:type, source),
    the entity PUI, and replaces the entity DbId with the encoded URI.
    Returns the (identifier, URI) pair.
    """
    data_id = get_identifier(entity_name, data)
    data_uri = get_uri(source, entity_name, data)
    global_id = uri_encode(data_uri)
    data['brapi:type'] = entity_name
    data['source'] = source['@id']
    data['@type'] = entity_name
    data['@id'] = data_uri
    data[entity_name + 'PUI'] = data_uri
    # The encoded URI becomes the globally unique DbId
    data[entity_name + 'DbId'] = global_id
    return data_id, data_uri
def load_file(options):
    """
    Index one entity JSON-lines file.

    :param options: (entity_name, json_path) tuple
    :return: list of (entity_name, data_id, data_location) tuples, where
        data_location records the file path and the position at which the
        object's JSON line starts (usable with file.seek to re-read it)
    """
    entity_name, json_path = options
    result = list()
    # Fix: read JSON as UTF-8 explicitly; without `encoding` the platform
    # default is used, which breaks non-ASCII content on some systems.
    with open(json_path, 'r', encoding='utf-8') as json_file:
        while True:
            # Capture the position before reading the line so it can be
            # seeked back to later
            offset = json_file.tell()
            line = json_file.readline()
            if not line:
                break
            data = json.loads(line)
            data_id = get_identifier(entity_name, data)
            data_location = {
                'file': json_path,
                'offset': offset,
                'brapi:type': entity_name
            }
            result.append((entity_name, data_id, data_location))
    return result
def link_objects(entity, object, linked_entity, linked_objects_by_id):
    """
    Cross-link one object with each of its linked objects, adding any linked
    object not yet present in the linked entity's store.

    Raises BrokenLink when a linked object resolves to nothing.
    """
    object_id = get_identifier(entity['name'], object)
    linked_entity_name = linked_entity['name']
    for link_id, linked_object in linked_objects_by_id.items():
        was_in_store = link_id in linked_entity['store']
        if was_in_store:
            # Prefer the object already present in the store
            linked_object = linked_entity['store'][link_id]
        if not linked_object:
            raise BrokenLink(
                f"{linked_entity_name} object id {link_id} not found in store while trying to link with "
                f"{entity['name']} object id {object_id}")
        # Reference each object from the other one
        link_object(entity['name'], linked_object, object_id)
        link_object(linked_entity_name, object, link_id)
        if not was_in_store:
            linked_entity['store'].add(linked_object)
def dump(self, data):
    """
    Write a BrAPI object to its entity's JSON split store and record the
    file/offset location at which it was written.
    """
    entity_name = data['brapi:type']
    json_store = self.json_stores.get(entity_name)
    if json_store is None:
        # Lazily create one split store per entity
        json_store = JSONSplitStore(
            self.json_dir, entity_name, buffer_size=1,
            max_file_byte_size=self.max_file_byte_size)
        self.json_stores[entity_name] = json_store
    data_id = get_identifier(entity_name, data)
    # Capture the position before dumping so it points at the line start
    file_name = json_store.json_file.name
    offset = json_store.json_file.tell()
    json_store.dump(data)
    json_store.flush()
    self._add_location(entity_name, data_id, {
        'file': file_name,
        'offset': offset,
        'brapi:type': entity_name
    })
def test_get_identifier(self):
    """The entity's DbId field, when present, is used directly as identifier."""
    self.assertEqual('foo', get_identifier('germplasm', {'germplasmDbId': 'foo'}))
def transform_uri_link(source: dict, entities: dict, ignore_links, id_index_files: dict,
                       entity_line: Tuple[str, str]) -> dict:
    """
    Transform BrAPI data by adding URI links translated from DbId links and
    replacing DbIds with encoded URIs.
    Also checks entity links to make sure every referenced entity exists.
    """
    entity, line = entity_line
    data = remove_empty(json.loads(line))
    data_id = get_identifier(entity, data)
    data[f"{entity}DbId"] = str(data_id)
    data_uri = get_generate_uri(source, entity, data)
    data[f"{entity}URI"] = data_uri

    # Basic JSON-LD fields (URI stored as @id)
    data['@type'] = entity
    data['@id'] = data_uri

    # Basic schema.org fields
    data['schema:includedInDataCatalog'] = source['@id']
    data['schema:identifier'] = data_id
    data['schema:name'] = data.get('schema:name') or data.get(entity + 'Name')

    # Add a URI link next to every DbId link
    for linked_entity, link_path, link_value in get_entity_links(data, 'DbId'):
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_uri_field = f"{linked_entity}URI{plural}"
        link_uri_path = [*link_path[:-1], link_uri_field]

        alias = None
        if linked_entity not in id_index_files:
            # Try to find an alias for the linked entity (ex: parent1 in pedigree is a germplasm)
            candidates = filter(
                # Find a link for current entity, in entity links
                lambda l: l['entity'] == linked_entity and 'entity-alias' in l,
                get_in(entities, [data['@type'], 'links']) or [])
            alias = next(map(lambda l: l['entity-alias'], candidates), None)

        # Locate the id-to-URI index of the linked entity
        try:
            id_index_file = id_index_files[alias or linked_entity]
        except KeyError as e:
            raise MissingDataLink(
                f"No '{alias or linked_entity}' data available to verify '{link_path}' data link "
                f"in JSON object:\n"
                f"{data}\n"
                f"If you want to ignore the '{alias or linked_entity}' data links add it to the 'ignore-links' "
                f"config option.\n"
                f"If you want to extract the '{alias or linked_entity}' from '{data['@type']}', add an "
                f"'internal-object' link in the 'config/extract-brapi/entities/{data['@type']}' config file.\n"
                f"If the path '{link_path}' corresponds to another type of entity, add an 'internal' link"
                f"with a 'entity-alias' in the 'config/extract-brapi/entities/{data['@type']}' config file."
            ) from e

        # open read only
        uri_index = UnQLite(id_index_file, flags=UNQLITE_OPEN_READONLY | UNQLITE_OPEN_MMAP)

        def get_in_index(link_id):
            try:
                return uri_index[link_id].decode()
            except KeyError as e:
                raise MissingDataLink(
                    f"Could not find '{alias or linked_entity}' with id '{link_id}' "
                    f"found in '{link_path}' of object:\n{data}") from e

        if plural:
            link_uri = list(map(get_in_index, link_value))
        else:
            link_uri = get_in_index(link_value)
        update_in(data, link_uri_path, link_uri)

    def encode_uri(uri):
        return base64.b64encode(str(uri).encode()).decode()

    # Replace every DbId link value with its b64-encoded URI
    for linked_entity, link_path, link_value in get_entity_links(data, 'URI'):
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_id_field = f"{linked_entity}DbId{plural}"
        link_id_path = [*link_path[:-1], link_id_field]
        if plural:
            link_id = list(map(encode_uri, link_value))
        else:
            link_id = encode_uri(link_value)
        update_in(data, link_id_path, link_id)
    return data
def get_or_generate_uri(source, entity, data):
    """Return the minimal JSON-LD identification fields for a BrAPI object."""
    data_id = get_identifier(entity, data)
    data_uri = get_generate_uri(source, entity, data)
    return {
        '@type': entity,
        '@id': data_uri,
        'schema:identifier': data_id,
    }