def transform_parse_uri(source: dict, entities: dict,
                        entity_line: Tuple[str, str]) -> List[dict]:
    """
    Parse a JSON line and describe the parsed object, and every internal
    object it contains, by type, URI and identifier.

    :param source: data source, handed over to ``get_generate_uri``
    :param entities: entity configurations by entity name; the ``links`` of
        the current entity are scanned for ``internal-object`` links
    :param entity_line: ``(entity name, JSON line)`` pair
    :return: one ``{'@type', '@id', 'schema:identifier'}`` dict per internal
        object found, followed by the one of the parsed object itself
    """
    entity, line = entity_line
    data = json.loads(line)
    output = []

    def describe(described_entity, described_data):
        # The identifier is resolved before the URI is requested
        identifier = get_identifier(described_entity, described_data)
        uri = get_generate_uri(source, described_entity, described_data)
        return {'@type': described_entity, '@id': uri, 'schema:identifier': identifier}

    # Internal objects (if any) come first in the output
    for link in get_in(entities, [entity, 'links']) or []:
        if link['type'] != 'internal-object':
            continue

        internal_entity = link['entity']
        internal_path = remove_empty(link['json-path'].split('.'))
        internal_values = get_in(data, internal_path)

        for internal_value in as_list(internal_values):
            output.append(describe(internal_entity, internal_value))

    # The parsed object itself always comes last
    output.append(describe(entity, data))
    return output
Example #2
0
def fetch_details(options):
    """
    Fetch details call for a BrAPI object (ex: /brapi/v1/studies/{id})

    :param options: tuple unpacked as ``(source, logger, entity, object_id)``
    :return: ``(entity name, [details])`` where ``details`` is the first result
        of the detail call, flagged with ``etl:detailed``; ``None`` when the
        entity has no detail call, when the object is already in store and is
        either flagged as detailed or configured with ``skip-if-in-store``,
        when no detail call is implemented, or when the call yields no result
    """
    source, logger, entity, object_id = options
    if 'detail' not in entity:
        return None
    detail_call_group = entity['detail']

    in_store = object_id in entity['store']
    skip_if_in_store = detail_call_group.get('skip-if-in-store')
    already_detailed = get_in(entity['store'], [object_id, 'etl:detailed'])
    if in_store and (skip_if_in_store or already_detailed):
        return None

    entity_name = entity['name']
    entity_id = entity_name + 'DbId'
    detail_call = get_implemented_call(source, detail_call_group, {entity_id: object_id})

    if not detail_call:
        return None

    results = BreedingAPIIterator.fetch_all(source['brapi:endpointUrl'], detail_call, logger)
    # next() with a default: an empty result must not leak StopIteration,
    # which could silently end whatever loop consumes this function's results
    details = next(results, None)
    if details is None:
        return None

    details['etl:detailed'] = True
    return entity_name, [details]
    def test_get_in_simple_path(self):
        # A path given as a bare key resolves to the value stored under it
        document = {"a": 1}

        resolved = get_in(document, "a")
        self.assertEqual(1, resolved)
    def test_get_in_invalid_simple_path(self):
        # "z" does not exist below "a": the lookup is expected to give None
        document = {"a": {"b": {"c": 1}}}
        missing_path = ["a", "z"]

        resolved = get_in(document, missing_path)
        self.assertEqual(None, resolved)
    def test_get_in_heterogeneous_nested_multi_value_path(self):
        # Each document nests its "c" leaves differently:
        # plain objects, a list under "a", and a list under "b"
        plain = {"a": {"b": {"c": 1}}}
        list_under_a = {"a": [{"b": {"c": 3}}]}
        list_under_b = {"a": {"b": [{"c": 4}, {"c": 5}]}}
        documents = [plain, list_under_a, list_under_b]

        # All leaves are expected back as one flat list, in document order
        collected = get_in(documents, ["a", "b", "c"])
        self.assertEqual([1, 3, 4, 5], collected)
    def test_get_in_nested_multi_value_path(self):
        # Three documents sharing the same nesting, differing only by their "c" leaf
        documents = [{"a": {"b": {"c": leaf}}} for leaf in (1, 2, 3)]

        # One leaf per document is expected, in document order
        collected = get_in(documents, ["a", "b", "c"])
        self.assertEqual([1, 2, 3], collected)
    def test_get_in_nested_path(self):
        # A list path walks the nested objects key by key down to the leaf
        document = {"a": {"b": {"c": 1}}}
        leaf_path = ["a", "b", "c"]

        resolved = get_in(document, leaf_path)
        self.assertEqual(1, resolved)
    def test_get_in_simple_multi_value_path(self):
        # A bare key applied to a list of documents collects one value per document
        documents = [{"a": value} for value in (1, 2, 3)]

        collected = get_in(documents, "a")
        self.assertEqual([1, 2, 3], collected)
Example #9
0
def fetch_all_links(source, logger, entities):
    """
    Link the stored objects of every entity to the objects they reference.

    Each link of an entity is applied to each object of its store, according to the link type:
     - ``internal``: the linked object is identified by its DbId (and Name) fields found at the
       link JSON path (ex: link a location via study.locationDbId)
     - ``internal-object``: the linked object is embedded at the link JSON path
       (ex: link a location via study.location.locationDbId)
     - ``external-object``: the linked objects are fetched with a dedicated call
       (ex: link to observation variables via /brapi/v1/studies/{id}/observationVariables)

    The linked objects, indexed by identifier, are handed over to ``link_objects``.

    :raises BrokenLink: when an internal link flagged ``required`` has no value in an object
    """
    for entity_name, entity in entities.items():
        if 'links' not in entity:
            continue

        for link in entity['links']:
            for object_id, record in entity['store'].items():
                target_name = link['entity']
                target_entity = entities[target_name]
                link_type = link['type']
                targets_by_id = {}

                if link_type.startswith('internal'):
                    json_path = link['json-path']
                    path_segments = remove_empty(json_path.split('.'))

                    found_values = remove_none(as_list(get_in(record, path_segments)))
                    if not found_values:
                        # Nothing to link: an error for required links only
                        if link.get('required'):
                            raise BrokenLink("Could not find required field '{}' in {} object id '{}'"
                                             .format(json_path, entity_name, object_id))
                        continue

                    if link_type == 'internal-object':
                        # Embedded objects are indexed as they are
                        targets_by_id = {
                            get_identifier(target_name, found_value): found_value
                            for found_value in found_values
                        }

                    elif link_type == 'internal':
                        # Only the DbId/Name pair of the linked object is kept
                        id_field = target_entity['name'] + 'DbId'
                        name_field = target_entity['name'] + 'Name'
                        for found_value in found_values:
                            target_id = found_value.get(id_field)
                            target_label = found_value.get(name_field)
                            if not target_id:
                                continue
                            targets_by_id[target_id] = {id_field: target_id, name_field: target_label}

                elif link_type == 'external-object':
                    call = get_implemented_call(source, link, context=record)
                    if not call:
                        continue

                    fetched_values = list(BreedingAPIIterator.fetch_all(source['brapi:endpointUrl'], call, logger))
                    targets_by_id = {
                        get_identifier(target_name, fetched_value): fetched_value
                        for fetched_value in fetched_values
                    }

                link_objects(entity, record, target_entity, targets_by_id)
def resolve_field_value_template(tree, data, data_index):
    """
    Resolve the value described by a parsed template ``tree`` against ``data``.

    Every ``object_path`` of the tree replaces ``data`` with the objects looked up in
    ``data_index`` for the ids found at that path. The ``value_path`` entries of the tree
    are then read from the resulting data.

    :param tree: parsed template, read under ``start.object_path`` and ``start.value_path``
    :param data: object the template is applied to
    :param data_index: indexes by entity name, each giving an object for an id
    :return: ``None`` as soon as an object path gives no id; the value found at the value
        path when there is exactly one; otherwise the list of distinct, non empty strings
        obtained by joining each combination of the values found at the value paths
    """
    for object_path in as_list(get_in(tree, ['start', 'object_path'])):
        field_path = get_field_path(object_path)
        ids = as_list(get_in(data, field_path))
        if not ids:
            return None

        # The entity name is the last path segment without its "URI"/"URIs" suffix
        entity = re.sub(r"(\w+)URIs?", "\\1", field_path[-1])
        entity_index = data_index[entity]
        try:
            # Indexed values are decoded then parsed as JSON...
            data = remove_none([json.loads(entity_index[key].decode()) for key in ids])
        except AttributeError:
            # ...unless an AttributeError occurs (ex: values without ``decode``),
            # in which case the indexed values are used as they are
            data = remove_none([entity_index[key] for key in ids])

        if getattr(entity_index, 'close', False):
            entity_index.close()

    value_paths = as_list(get_in(tree, ['start', 'value_path']))
    if len(value_paths) == 1:
        return get_in(data, get_field_path(value_paths[0]))

    def join_parts(parts):
        # Space separated concatenation, in which a part starting with everything
        # accumulated so far replaces it instead of being appended
        accumulated = ""
        for part in parts:
            accumulated = part if part.startswith(accumulated) else accumulated + " " + part
        return accumulated

    new_value = []
    for item in as_list(data):

        def values_at(value_path):
            return as_list(get_in(item, get_field_path(value_path))) or None

        field_values = remove_empty(map(values_at, value_paths))
        # One joined string per combination of one value from each value path
        new_value.extend(map(join_parts, itertools.product(*field_values)))
    return list(distinct(remove_empty(new_value)))
Example #11
0
def remove_internal_objects(entities):
    """
    Remove objects referenced inside others (example: trial.studies or study.location)

    For every ``internal-object`` link of every entity, the last segment of the link JSON path
    is deleted from the object(s) found at the preceding segments, in each object of the
    entity store. Stored objects are modified in place.

    :param entities: entity configurations by entity name, each with an optional ``links`` list
        and, when it has ``internal-object`` links, a ``store`` of objects
    """
    for (_, entity) in entities.items():
        for link in (entity.get('links') or []):
            if link['type'] != 'internal-object':
                continue

            # The path only depends on the link: split it once, not once per stored object
            link_path_list = remove_empty(link['json-path'].split('.'))
            context_path, last = link_path_list[:-1], link_path_list[-1]

            for (_, data) in entity['store'].items():
                link_context = get_in(data, context_path)
                # A context path going through a list gives several contexts:
                # the internal object has to be removed from each of them
                contexts = link_context if isinstance(link_context, list) else [link_context]
                for context in contexts:
                    if isinstance(context, dict) and last in context:
                        del context[last]
    def test_get_in_invalid_multi_path(self):
        # Neither document has a "c" key directly below "a"
        single_leaf = {"a": {"b": {"c": 1}}}
        listed_leaves = {"a": {"b": [{"c": 4}, {"c": 5}]}}
        documents = [single_leaf, listed_leaves]

        # The lookup is expected to give None rather than a list
        resolved = get_in(documents, ["a", "c", "d"])
        self.assertEqual(None, resolved)
def fetch_details(options):
    """
    Fetch details call for a BrAPI object (ex: /brapi/v1/studies/{id})

    :param options: tuple unpacked as ``(source, logger, entity, object_id)``
    :return: ``(entity name, [details])`` where ``details`` is the first result of the detail
        call (or its single ``data`` item when it has exactly one), flagged with
        ``etl:detailed``; ``None`` when the entity has no detail call, when the object is
        already in store and is either flagged as detailed or configured with
        ``skip-if-in-store``, when no detail call is implemented, or when the call yields
        no result
    :raises EndPointError: when the detail call is configured with ``expect-single-result``
        and its result has a ``data`` list that does not hold exactly one item
    """
    source, logger, entity, object_id = options
    if 'detail' not in entity:
        return None
    detail_call_group = entity['detail']

    in_store = object_id in entity['store']
    skip_if_in_store = detail_call_group.get('skip-if-in-store')
    already_detailed = get_in(entity['store'], [object_id, 'etl:detailed'])
    if in_store and (skip_if_in_store or already_detailed):
        return None

    entity_name = entity['name']
    entity_id = entity_name + 'DbId'
    detail_call = get_implemented_call(source, detail_call_group,
                                       {entity_id: object_id})

    if not detail_call:
        return None

    results = BreedingAPIIterator.fetch_all(source['brapi:endpointUrl'],
                                            detail_call, logger)
    # next() with a default: an empty result must not leak StopIteration,
    # which could silently end whatever loop consumes this function's results
    details = next(results, None)
    if details is None:
        return None

    # -----------------------------------------------------------------
    # Detect buggy endpoints that return a list of results instead of a single one.
    if 'data' in details:
        result_count = len(details['data'])
        if detail_call_group.get("expect-single-result") and result_count != 1:
            # The count is reported since it may be zero as well as several
            message = f"Expected a single result for {detail_call}, got {result_count}"
            logger.debug(message)
            raise EndPointError(message)
        if result_count == 1:
            details = details['data'][0]
    # -----------------------------------------------------------------

    # Flag the object that is actually returned: flagging before the unwrapping above
    # would leave the flag on the discarded wrapper
    details['etl:detailed'] = True

    return entity_name, [details]
def transform_uri_link(source: dict, entities: dict, ignore_links,
                       id_index_files: dict,
                       entity_line: Tuple[str, str]) -> dict:
    """
    Transform BrAPI data by adding URI links translated from DbId links and replacing DbIds with encoded URIs.
    Also checks entity links to make sure every referenced entity exists.

    :param source: data source; its ``@id`` is stored as ``schema:includedInDataCatalog``
    :param entities: entity configurations by entity name; the ``links`` of the current entity
        are searched for an ``entity-alias`` when a linked entity has no index file
    :param ignore_links: linked entity names whose links are neither resolved nor encoded
    :param id_index_files: index file by entity name (or alias), opened with UnQLite to
        translate an identifier into a URI
    :param entity_line: ``(entity name, JSON line)`` pair
    :return: the transformed JSON object
    :raises MissingDataLink: when no index file is available for a linked entity, or when a
        linked identifier is missing from its index
    """
    entity, line = entity_line
    data = remove_empty(json.loads(line))

    data_id = get_identifier(entity, data)
    data[f"{entity}DbId"] = str(data_id)

    data_uri = get_generate_uri(source, entity, data)
    data[f"{entity}URI"] = data_uri

    # Add basic JSON-LD fields (store URI as @id)
    data['@type'] = entity
    data['@id'] = data_uri

    # Add basic schema.org fields
    data['schema:includedInDataCatalog'] = source['@id']
    data['schema:identifier'] = data_id
    data['schema:name'] = data.get('schema:name') or data.get(entity + 'Name')

    # Create URI links for each DbId link
    id_links = get_entity_links(data, 'DbId')
    for linked_entity, link_path, link_value in id_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_uri_field = f"{linked_entity}URI{plural}"
        link_uri_path = [*link_path[:-1], link_uri_field]

        alias = None
        if linked_entity not in id_index_files:
            # Try to find an alias for the linked entity (ex: parent1 in pedigree is a germplasm)
            entity_links = get_in(entities, [data['@type'], 'links']) or []
            alias = next(
                (entity_link['entity-alias'] for entity_link in entity_links
                 if entity_link['entity'] == linked_entity and 'entity-alias' in entity_link),
                None)

        # Linked entity index by Id
        try:
            id_index_file = id_index_files[alias or linked_entity]
        except KeyError as e:
            raise MissingDataLink(
                f"No '{alias or linked_entity}' data available to verify '{link_path}' data link "
                f"in JSON object:\n"
                f"{data}\n"
                f"If you want to ignore the '{alias or linked_entity}' data links add it to the 'ignore-links' "
                f"config option.\n"
                f"If you want to extract the '{alias or linked_entity}' from '{data['@type']}', add an "
                f"'internal-object' link in the 'config/extract-brapi/entities/{data['@type']}' config file.\n"
                f"If the path '{link_path}' corresponds to another type of entity, add an 'internal' link "
                f"with a 'entity-alias' in the 'config/extract-brapi/entities/{data['@type']}' config file."
            ) from e

        # open read only
        uri_index = UnQLite(id_index_file,
                            flags=UNQLITE_OPEN_READONLY | UNQLITE_OPEN_MMAP)
        try:

            def get_in_index(link_id):
                try:
                    return uri_index[link_id].decode()
                except KeyError as e:
                    raise MissingDataLink(
                        f"Could not find '{alias or linked_entity}' with id '{link_id}' "
                        f"found in '{link_path}' of object:\n{data}") from e

            if plural:
                link_uri = list(map(get_in_index, link_value))
            else:
                link_uri = get_in_index(link_value)
        finally:
            # One index is opened per link: release it as soon as the lookups are done,
            # including when a lookup fails
            uri_index.close()

        update_in(data, link_uri_path, link_uri)

    def encode_uri(uri):
        return base64.b64encode(str(uri).encode()).decode()

    # Replace DbId with b64 encoded URI
    uri_links = get_entity_links(data, 'URI')
    for linked_entity, link_path, link_value in uri_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_id_field = f"{linked_entity}DbId{plural}"
        link_id_path = [*link_path[:-1], link_id_field]

        if plural:
            link_id = list(map(encode_uri, link_value))
        else:
            link_id = encode_uri(link_value)

        update_in(data, link_id_path, link_id)

    return data
def get_field_path(tree):
    """
    Give what ``get_in`` finds in ``tree`` under ``field_path`` then ``FIELD``.
    """
    field_lookup = ['field_path', 'FIELD']
    return get_in(tree, field_lookup)