コード例 #1
def resolve_field_value_template(tree, data, data_index):
    """Resolve a field-value template described by ``tree`` against ``data``.

    Each ``start.object_path`` entry of ``tree`` is followed in turn: the ids
    found in ``data`` at that field path are looked up in ``data_index`` and
    the objects found (``None`` results dropped) become the new ``data``.

    With a single ``start.value_path`` the value at that path is returned
    as-is.  With several value paths, for every data object the cartesian
    product of the values found at each path is built and each combination is
    joined with spaces (a value that starts with the text accumulated so far
    replaces it instead of being appended).

    :param tree: parsed template node holding ``start.object_path`` and
        ``start.value_path``.
    :param data: object (or list of objects) the paths are resolved against.
    :param data_index: mapping with a ``get(id)`` method returning the object
        for an id.
    :return: ``None`` when an object path yields no ids, the resolved value
        for a single value path, otherwise a list of distinct joined strings.

    NOTE(review): the ``else`` branch, the ``map(`` opener, the ``product``
    argument and the ``new_value.extend(...)`` call were missing from the
    reviewed text and have been reconstructed from the surviving argument
    structure -- confirm against the upstream implementation.
    """
    object_paths = as_list(resolve_path(tree, ['start', 'object_path']))
    for object_path in object_paths:
        ids = as_list(resolve_path(data, get_field_path(object_path)))
        if not ids:
            return None
        data = remove_none([data_index.get(object_id) for object_id in ids])

    value_paths = as_list(resolve_path(tree, ['start', 'value_path']))
    if len(value_paths) == 1:
        return resolve_path(data, get_field_path(value_paths[0]))

    new_value = []
    for item in as_list(data):
        # One list of values per value path; paths without value are dropped.
        field_values = remove_empty(
            map(
                lambda value_path: as_list(
                    resolve_path(item, get_field_path(value_path))) or None,
                value_paths))
        product = itertools.product(*field_values)
        # Join each combination left to right, skipping a repeated prefix.
        joined = map(
            lambda field_value: reduce(
                lambda acc, s: s
                if s.startswith(acc) else acc + " " + s, field_value, ""),
            product)
        if joined:
            new_value.extend(joined)
    return list(distinct(remove_empty(new_value)))
コード例 #2
def link_object(dest_entity_name, dest_object, src_object_id):
    """Record ``src_object_id`` in the ``<dest_entity_name>DbIds`` field of ``dest_object``.

    The field is normalised to a set (an existing non-set value is converted),
    the id is added, and the result is stored back after being passed through
    ``remove_empty``.

    :param dest_entity_name: prefix used to build the ``...DbIds`` field name.
    :param dest_object: dict updated in place.
    :param src_object_id: id to add to the set of linked ids.
    """
    dest_object_ref = dest_entity_name + 'DbIds'
    dest_object_ids = dest_object.get(dest_object_ref) or set()
    if not isinstance(dest_object_ids, set):
        dest_object_ids = set(dest_object_ids)
    # Without this step the id passed by the caller was never recorded.
    dest_object_ids.add(src_object_id)
    dest_object[dest_object_ref] = remove_empty(dest_object_ids)
コード例 #3
def transform_parse_uri(source: dict, entities: dict,
                        entity_line: Tuple[str, str]) -> List[dict]:
    """
    Parse JSON, get or generate ID, get or generate URI

    :param source: data source description passed to ``get_generate_uri``.
    :param entities: entity configuration; ``entities[entity]['links']`` is
        scanned for links of type ``internal-object``.
    :param entity_line: ``(entity name, JSON text)`` pair.
    :return: one ``{'@type', '@id', 'schema:identifier'}`` dict per internal
        object found in the parsed data, followed by one for the data itself.
    """
    entity, line = entity_line
    data = json.loads(line)
    output = []

    def get_or_generate_uri(source, entity, data):
        data_id = get_identifier(entity, data)
        data_uri = get_generate_uri(source, entity, data)
        return {'@type': entity, '@id': data_uri, 'schema:identifier': data_id}

    # Extract internal objects (if any)
    entity_links = get_in(entities, [entity, 'links']) or []
    internal_object_links = [
        link for link in entity_links if link['type'] == 'internal-object'
    ]
    for link in internal_object_links:
        link_entity = link['entity']
        # 'json-path' is dot separated; empty segments are discarded
        link_path = remove_empty(link['json-path'].split('.'))
        link_values = get_in(data, link_path)

        for link_value in as_list(link_values):
            # Output internal object
            output.append(get_or_generate_uri(source, link_entity, link_value))

    # Output current data object
    output.append(get_or_generate_uri(source, entity, data))
    return output
コード例 #4
def fetch_all_links(source, logger, entities):
    """
    Link objects across entities.
     - Internal: link an object (ex: study) to another using an identifier inside the JSON object
      (ex: link a location via study.locationDbId)
     - Internal object: link an object (ex: study) to another contained inside the first
      (ex: link a location via study.location.locationDbId)
     - External object: link an object (ex: study) to another using a dedicated call
      (ex: link to observation variables via /brapi/v1/studies/{id}/observationVariables)

    :param source: data source description; ``source['brapi:endpointUrl']`` is
        used for ``external-object`` links.
    :param logger: logger handed to ``BreedingAPIIterator.fetch_all``.
    :param entities: mapping of entity name to entity configuration; entities
        with a ``links`` list have every object of their ``store`` processed.
    :raises BrokenLink: when a link flagged ``required`` has no value in an
        object.
    """
    for (entity_name, entity) in entities.items():
        if 'links' not in entity:
            # Nothing to link for this entity
            continue

        for link in entity['links']:
            for (object_id, data_object) in entity['store'].items():
                linked_entity_name = link['entity']
                linked_entity = entities[linked_entity_name]
                linked_objects_by_id = {}

                if link['type'].startswith('internal'):
                    link_path = link['json-path']
                    link_path_list = remove_empty(link_path.split('.'))

                    link_values = remove_none(as_list(get_in(data_object, link_path_list)))
                    if not link_values:
                        if link.get('required'):
                            raise BrokenLink("Could not find required field '{}' in {} object id '{}'"
                                             .format(link_path, entity_name, object_id))

                    if link['type'] == 'internal-object':
                        # The linked object is embedded: index it by its identifier
                        for link_value in link_values:
                            link_id = get_identifier(linked_entity_name, link_value)
                            linked_objects_by_id[link_id] = link_value

                    elif link['type'] == 'internal':
                        # Only the id (and name) of the linked object are available
                        link_id_field = linked_entity['name'] + 'DbId'
                        link_name_field = linked_entity['name'] + 'Name'
                        for link_value in link_values:
                            link_id = link_value.get(link_id_field)
                            link_name = link_value.get(link_name_field)
                            if link_id:
                                linked_objects_by_id[link_id] = {link_id_field: link_id, link_name_field: link_name}

                elif link['type'] == 'external-object':
                    call = get_implemented_call(source, link, context=data_object)
                    if not call:
                        # No usable call for this link: skip this object
                        continue

                    link_values = list(BreedingAPIIterator.fetch_all(source['brapi:endpointUrl'], call, logger))
                    for link_value in link_values:
                        link_id = get_identifier(linked_entity_name, link_value)
                        linked_objects_by_id[link_id] = link_value

                link_objects(entity, data_object, linked_entity, linked_objects_by_id)
コード例 #5
 def add(self, data):
     """Add ``data`` to the store, merging it into any record with the same identifier.

     ``data`` is first compacted with ``remove_empty``; nothing is stored when
     the result is empty.  The record is tagged with ``self.source_id`` under
     the ``source`` key before being stored.

     :param data: record to add.
     """
     # Compact object by removing nulls and empty
     data = remove_empty(data)
     if data:
         data['source'] = self.source_id
         data_id = get_identifier(self.entity_name, data)
         if data_id in self:
             dict_merge(self[data_id], data)
         else:
             # Only store the record as-is when there is nothing to merge
             # into; assigning unconditionally would discard the merge above.
             self[data_id] = data
コード例 #6
def fetch_all_in_store(entities, fetch_function, arguments, pool):
    """
    Run a fetch function with arguments in a pool worker and collect results in the entity MergeStore

    :param entities: mapping of entity name to entity configuration.
    :param fetch_function: callable run by the pool for each item of
        ``arguments``; each result is unpacked as ``(entity_name, data_list)``.
    :param arguments: iterable of arguments handed to ``pool.imap_unordered``.
    :param pool: worker pool exposing ``imap_unordered``.
    """
    results = remove_empty(pool.imap_unordered(fetch_function, arguments, 4))
    if not results:
        return

    for (entity_name, data_list) in results:
        for data in data_list:
            # NOTE(review): the loop body was missing from the reviewed text;
            # storing into the entity's 'store' follows the docstring -- confirm.
            entities[entity_name]['store'].add(data)
コード例 #7
def resolve_map_template(template, data, data_index):
    """Resolve a ``{map}``/``{to}`` template.

    The ``{map}`` part is resolved against ``data`` and coerced with
    ``as_list``; every element is then resolved through the ``{to}`` part.

    :return: ``None`` when ``remove_empty`` leaves nothing of the resolved
        elements, otherwise the list of transformed elements.
    :raises Exception: when the resolved elements are not a list.
    """
    source_template = template.get('{map}')
    item_template = template.get('{to}')

    items = as_list(resolve(source_template, data, data_index))
    if not isinstance(items, list):
        raise Exception("Map can only work on lists.")
    if not remove_empty(items):
        return None

    return [resolve(item_template, item, data_index) for item in items]
コード例 #8
def resolve_field_value_template(tree, data, data_index):
    """Resolve a field-value template described by ``tree`` against ``data``.

    For each ``start.object_path`` the ids found in ``data`` are read, the
    entity name is derived from the last path component by stripping a
    trailing ``URI``/``URIs``, and the matching index is taken from
    ``data_index``.  ``None`` is returned as soon as a path yields no ids.

    NOTE(review): this excerpt is missing several lines (see the notes below)
    and does not parse as written; the comments only describe what is visible.
    """
    object_paths = as_list(get_in(tree, ['start', 'object_path']))
    for object_path in object_paths:
        field_path = get_field_path(object_path)
        ids = as_list(get_in(data, field_path))
        if not ids:
            return None
        field = field_path[-1]
        # e.g. a field named "<entity>URIs" gives the entity name "<entity>"
        entity = re.sub(r"(\w+)URIs?", "\\1", field)
        entity_index = data_index[entity]
        # NOTE(review): a `try:` line appears to be missing here; the
        # `except AttributeError` below has no matching `try` in this excerpt.
            dataList = []
            for id in ids:
            # NOTE(review): the body of the loop above is missing.
            data = remove_none(dataList)
        except AttributeError:
            # Fallback: index entries are used directly as the data objects
            data = remove_none(list(map(lambda id: entity_index[id], ids)))

        # NOTE(review): the body of this `if` is missing; presumably the index
        # is closed when it exposes a `close` attribute -- confirm.
        if getattr(entity_index, 'close', False):
    value_paths = as_list(get_in(tree, ['start', 'value_path']))
    if len(value_paths) == 1:
        # Single value path: return the value found at that path as-is
        return get_in(data, get_field_path(value_paths[0]))
        # NOTE(review): an `else:` line appears to be missing here; as written
        # the lines below are unreachable.
        new_value = []
        for data in as_list(data):
            # NOTE(review): the call wrapping this lambda (and its iterable
            # argument) is truncated.
            field_values = remove_empty(
                    lambda value_path: as_list(
                        get_in(data, get_field_path(value_path))) or None,
            # Cartesian product of the values found for each value path
            product = itertools.product(*field_values)
            # Each combination is folded into one string: a part that starts
            # with the accumulated text replaces it, otherwise it is appended
            # after a space.
            # NOTE(review): the iterable argument of this `map(` is missing.
            joined = map(
                lambda field_value: reduce(
                    lambda acc, s: s
                    if s.startswith(acc) else acc + " " + s, field_value, ""),
            # NOTE(review): the body of this `if` is missing.
            if joined:
        return list(distinct(remove_empty(new_value)))
コード例 #9
def fetch_all_in_store(entities, fetch_function, arguments, pool):
    """
    Run a fetch function with arguments in a pool worker and collect results in the entity MergeStore

    :param entities: mapping of entity name to entity configuration.
    :param fetch_function: callable run by the pool for each item of
        ``arguments``; each result is unpacked as ``(entity_name, data_list)``.
    :param arguments: sequence of argument tuples; the source identifier is
        read from ``arguments[0][0]['schema:identifier']``.
    :param pool: worker pool exposing ``imap_unordered``.
    """
    source_name = arguments[0][0]['schema:identifier']
    results = remove_empty(pool.imap_unordered(fetch_function, arguments, 4))
    if not results:
        return

    for (entity_name, data_list) in results:
        for data in data_list:
            if source_name == 'WUR' and entity_name == 'study':
                # Source-specific patch: "-01-01" is appended to the start
                # date of studies coming from the 'WUR' source.
                data['startDate'] = data['startDate'] + "-01-01"
            # NOTE(review): the store insertion was missing from the reviewed
            # text; reconstructed from the docstring -- confirm.
            entities[entity_name]['store'].add(data)
コード例 #10
def remove_internal_objects(entities):
    """
    Remove objects referenced inside others (example: trial.studies or study.location)

    For every ``internal-object`` link of every entity, the key named by the
    last segment of the link's ``json-path`` is deleted from its parent
    container in each object of the entity's ``store``.

    :param entities: mapping of entity name to entity configuration
        (``links`` list and ``store`` of objects); stored objects are
        modified in place.
    """
    for (entity_name, entity) in entities.items():
        for link in (entity.get('links') or []):
            if link['type'] != 'internal-object':
                # Only embedded objects are stripped
                continue

            for (_, data) in entity['store'].items():
                link_path = link['json-path']
                link_path_list = remove_empty(link_path.split('.'))

                # Split the path into the parent container path and final key
                context_path, last = link_path_list[:-1], link_path_list[-1]
                link_context = get_in(data, context_path)
                if link_context and last in link_context:
                    del link_context[last]
コード例 #11
 def test_remove_empty(self):
     """``remove_empty`` drops ``None``, empty containers/strings and dicts
     that end up empty, strips ``None`` from a set, and keeps ``0``."""
     self.assertEqual(
         [{1, 2}, 0],
         remove_empty([None, [], "", {1, 2, None}, 0, {"a": {"b": {}}}]))
コード例 #12
def transform_uri_link(source: dict, entities: dict, ignore_links,
                       id_index_files: dict,
                       entity_line: Tuple[str, str]) -> dict:
    """
    Transform BrAPI data by adding URI links translated from DbId links and replacing DbIds with encoded URIs.
    Also checks entity links to make sure every referenced entity exists.

    :param source: data source description; ``source['@id']`` is stored under
        ``schema:includedInDataCatalog``.
    :param entities: entity configuration, used to look up ``entity-alias``
        declarations in ``entities[<entity>]['links']``.
    :param ignore_links: collection of entity names whose links are skipped.
    :param id_index_files: mapping of entity name to the UnQLite index file
        that maps ids to URIs.
    :param entity_line: ``(entity name, JSON text)`` pair.
    :return: the transformed data object.
    :raises MissingDataLink: when no index exists for a linked entity or a
        linked id is absent from its index.
    """
    entity, line = entity_line
    data = remove_empty(json.loads(line))

    data_id = get_identifier(entity, data)
    data[f"{entity}DbId"] = str(data_id)

    data_uri = get_generate_uri(source, entity, data)
    data[f"{entity}URI"] = data_uri

    # Add basic JSON-LD fields (store URI as @id)
    data['@type'] = entity
    data['@id'] = data_uri

    # Add basic schema.org fields
    data['schema:includedInDataCatalog'] = source['@id']
    data['schema:identifier'] = data_id
    data['schema:name'] = data.get('schema:name') or data.get(entity + 'Name')

    # Create URI links for each DbId link
    id_links = get_entity_links(data, 'DbId')
    for linked_entity, link_path, link_value in id_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_uri_field = f"{linked_entity}URI{plural}"
        link_uri_path = [*link_path[:-1], link_uri_field]

        alias = None
        if linked_entity not in id_index_files:
            # Try to find an alias for the linked entity (ex: parent1 in pedigree is a germplasm)
            aliases = map(
                lambda l: l['entity-alias'],
                filter(
                    # Find a link for current entity
                    lambda l: l['entity'] == linked_entity and 'entity-alias'
                    in l,
                    # In entity links
                    get_in(entities, [data['@type'], 'links']) or []))
            alias = next(aliases, None)

        # Linked entity index by Id
        try:
            id_index_file = id_index_files[alias or linked_entity]
        except KeyError as e:
            raise MissingDataLink(
                f"No '{alias or linked_entity}' data available to verify '{link_path}' data link "
                f"in JSON object:\n"
                f"If you want to ignore the '{alias or linked_entity}' data links add it to the 'ignore-links' "
                f"config option.\n"
                f"If you want to extract the '{alias or linked_entity}' from '{data['@type']}', add an "
                f"'internal-object' link in the 'config/extract-brapi/entities/{data['@type']}' config file.\n"
                f"If the path '{link_path}' corresponds to another type of entity, add an 'internal' link "
                f"with a 'entity-alias' in the 'config/extract-brapi/entities/{data['@type']}' config file."
            ) from e

        # open read only
        uri_index = UnQLite(id_index_file,
                            flags=UNQLITE_OPEN_READONLY | UNQLITE_OPEN_MMAP)

        def get_in_index(link_id):
            # Translate one linked id into its URI, failing loudly when the
            # referenced object is unknown.
            try:
                return uri_index[link_id].decode()
            except KeyError as e:
                raise MissingDataLink(
                    f"Could not find '{alias or linked_entity}' with id '{link_id}' "
                    f"found in '{link_path}' of object:\n{data}") from e

        if plural:
            link_uri = list(map(get_in_index, link_value))
        else:
            link_uri = get_in_index(link_value)

        update_in(data, link_uri_path, link_uri)

    def encode_uri(uri):
        return base64.b64encode(str(uri).encode()).decode()

    # Replace DbId with b64 encoded URI
    uri_links = get_entity_links(data, 'URI')
    for linked_entity, link_path, link_value in uri_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_id_field = f"{linked_entity}DbId{plural}"
        link_id_path = [*link_path[:-1], link_id_field]

        if plural:
            link_id = list(map(encode_uri, link_value))
        else:
            link_id = encode_uri(link_value)

        update_in(data, link_id_path, link_id)

    return data
コード例 #13
def launch_etl(options, config):
    # Runs the ETL steps selected by the keys present in `options`, adjusting
    # `config` in place before each step.
    # NOTE(review): this excerpt is incomplete -- several statement bodies and
    # call arguments are missing (see the notes below) and it does not parse
    # as written.  The calls that actually run each step are not visible.

    # NOTE(review): the body of the signal handler is missing.
    def handler(*_):

    # Install `handler` for SIGINT
    signal.signal(signal.SIGINT, handler)
    # Remember the configured index template before the CLI may override it
    default_index_template = config['load-elasticsearch']['index-template']

    # Execute ETL actions based on CLI arguments:
    # NOTE(review): the body of the extract step is missing.
    if 'extract' in options or 'etl_es' in options or 'etl_virtuoso' in options:

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']

        # Restrict list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            # NOTE(review): the argument of `set(` is truncated.
            transform_config['restricted-documents'] = set(

        # Copy base jsonschema definitions into each document jsonschema
        validation_config = transform_config['validation']
        base_definitions = validation_config['base-definitions']
        for (document_type,
             document_schema) in validation_config['documents'].items():
            document_schema['definitions'] = base_definitions

        # Run transform
        # NOTE(review): the transform call itself is missing.

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with absolute path
        for (entity_name,
             entity) in config['transform-jsonld']['entities'].items():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    # NOTE(review): the `.format(...)` arguments and closing
                    # parenthesis of this exception are missing.
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])


    # NOTE(review): the body of the RDF transform step is missing.
    if 'transform_rdf' in options or 'etl_virtuoso' in options:

    if 'load_elasticsearch' in options or 'etl_es' in options:
        mapping_files = list_entity_files(
            os.path.join(config['conf-dir'], 'elasticsearch'))

        # Optional comma separated list of document types given on the CLI
        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))
        config['load-elasticsearch']['url'] = '{}:{}'.format(
            options['host'], options['port'])
        # NOTE(review): the closing brace of this dict comprehension is missing.
        config['load-elasticsearch']['mappings'] = {
            document_type: file_path
            for document_type, file_path in mapping_files
        # CLI index template takes precedence over the configured default
        config['load-elasticsearch']['index-template'] = options.get(
            'index_template') or default_index_template
        # NOTE(review): the start of the assignment below is missing.
            'document-types'] = selected_document_types

    # NOTE(review): the body of the Virtuoso load step is missing.
    if 'load_virtuoso' in options or 'etl_virtuoso' in options:
コード例 #14
def extend_config(config, options):
    # NOTE(review): the next line reads like docstring text whose triple-quote
    # delimiters are missing; several expressions further down are truncated
    # as well, so this excerpt does not parse as written.
    Extend the configuration with the options provided in CLI arguments

    # Keep the raw CLI options available from the configuration
    config['options'] = options

    # Data output dir
    config['data-dir'] = get_folder_path(
        [options.get('data_dir') or config['default-data-dir']], create=True)

    # Sources config
    # Each source file is parsed as JSON and registered under its
    # 'schema:identifier'; a missing or duplicated identifier is an error.
    config['sources'] = dict()
    source_id_field = 'schema:identifier'
    for source_file in (options.get('sources') or list()):
        source_config = json.loads(source_file.read())
        if source_id_field not in source_config:
            raise Exception(
                "No field '{}' in data source JSON configuration file '{}'".
                format(source_id_field, source_file.name))
        identifier = source_config[source_id_field]
        if identifier in config['sources']:
            raise Exception(
                "Source id '{}' found twice in source list: {}\n"
                "Please verify the '{}' field in your files.".format(
                    identifier, options['sources'], source_id_field))
        config['sources'][identifier] = source_config

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']
        # NOTE(review): the argument of `list(` is truncated.
        transform_config['documents'] = list(

        # Restrict list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            # NOTE(review): the argument of `set(` is truncated.
            transform_config['restricted-documents'] = set(

        # Copy base jsonschema definitions into each document jsonschema
        validation_schemas = transform_config['validation-schemas']
        base_definitions = validation_schemas['base-definitions']
        for (document_type, document_schema) in validation_schemas.items():
            # Skip the entry holding the base definitions themselves
            if document_schema != base_definitions:
                document_schema['definitions'] = base_definitions

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with absolute path
        for (entity_name,
             entity) in config['transform-jsonld']['entities'].items():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    # NOTE(review): the `.format(...)` arguments and closing
                    # parenthesis of this exception are missing.
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

    if 'load_elasticsearch' in options or 'etl_es' in options:
        load_elasticsearch = config['load-elasticsearch']

        # CLI selected list of document types
        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))
        load_elasticsearch['document-types'] = selected_document_types

        elasticsearch_config = load_elasticsearch['config']

        # CLI values take precedence over the values from the configuration
        load_elasticsearch['index-template'] = options.get(
            'index_template') or elasticsearch_config['index-template']

        load_elasticsearch['url'] = '{}:{}'.format(
            options['host'] or elasticsearch_config['host'], options['port']
            or elasticsearch_config['port'])
    return config