Code example #1
import itertools
from functools import reduce

# as_list, resolve_path, get_field_path, remove_none, remove_empty and
# distinct are helpers from the surrounding project (assumed in scope)
def resolve_field_value_template(tree, data, data_index):
    # Follow the object path(s), replacing `data` with the indexed objects referenced by id
    object_paths = as_list(resolve_path(tree, ['start', 'object_path']))
    for object_path in object_paths:
        ids = as_list(resolve_path(data, get_field_path(object_path)))
        if not ids:
            return None
        data = remove_none([data_index.get(data_id) for data_id in ids])
    value_paths = as_list(resolve_path(tree, ['start', 'value_path']))
    if len(value_paths) == 1:
        return resolve_path(data, get_field_path(value_paths[0]))
    # Several value paths: join each combination of values into one string
    new_value = []
    for item in as_list(data):
        field_values = remove_empty(
            map(lambda value_path: as_list(
                    resolve_path(item, get_field_path(value_path))) or None,
                value_paths))
        for field_value in itertools.product(*field_values):
            # Concatenate the combination, skipping redundant prefixes
            joined = reduce(
                lambda acc, s: s if s.startswith(acc) else acc + " " + s,
                field_value, "")
            new_value.append(joined)
    return list(distinct(remove_empty(new_value)))
Code example #2
def link_object(dest_entity_name, dest_object, src_object_id):
    # Register src_object_id in the destination object's set of linked ids
    # (stored under ex: 'locationDbIds' for the 'location' entity)
    dest_object_ref = dest_entity_name + 'DbIds'
    dest_object_ids = dest_object.get(dest_object_ref) or set()
    if not isinstance(dest_object_ids, set):
        dest_object_ids = set(dest_object_ids)
    dest_object_ids.add(src_object_id)
    dest_object[dest_object_ref] = remove_empty(dest_object_ids)
Code example #3
import json
from typing import List, Tuple

def transform_parse_uri(source: dict, entities: dict,
                        entity_line: Tuple[str, str]) -> List[dict]:
    """
    Parse JSON, get or generate ID, get or generate URI
    """
    entity, line = entity_line
    data = json.loads(line)
    output = []

    def get_or_generate_uri(source, entity, data):
        data_id = get_identifier(entity, data)
        data_uri = get_generate_uri(source, entity, data)
        return {'@type': entity, '@id': data_uri, 'schema:identifier': data_id}

    # Extract internal objects (if any)
    internal_object_links = filter(
        lambda link: link['type'] == 'internal-object',
        get_in(entities, [entity, 'links']) or [])
    for link in internal_object_links:
        link_entity = link['entity']
        link_path = remove_empty(link['json-path'].split('.'))
        link_values = get_in(data, link_path)

        for link_value in as_list(link_values):
            # Output internal object
            output.append(get_or_generate_uri(source, link_entity, link_value))

    # Output current data object
    output.append(get_or_generate_uri(source, entity, data))
    return output
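A rough usage sketch (the entity name, JSON line and resulting URI are made up; get_identifier and get_generate_uri are the project helpers used above):

# Hypothetical call on one line of a 'study' JSON dump:
out = transform_parse_uri(source, entities, ('study', '{"studyDbId": "S1"}'))
# -> [{'@type': 'study', '@id': '<generated URI>', 'schema:identifier': 'S1'}]
# (assuming get_identifier reads 'studyDbId' and the study has no internal-object links)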
Code example #4
def fetch_all_links(source, logger, entities):
    """
    Link objects across entities.
     - Internal: link an object (ex: study) to another using an identifier inside the JSON object
      (ex: link a location via study.locationDbId)
     - Internal object: link an object (ex: study) to another contained inside the first
      (ex: link a location via study.location.locationDbId)
     - External object: link an object (ex: study) to another using a dedicated call
      (ex: link to observation variables via /brapi/v1/studies/{id}/observationVariables)
    """
    for (entity_name, entity) in entities.items():
        if 'links' not in entity:
            continue

        for link in entity['links']:
            for (object_id, obj) in entity['store'].items():
                linked_entity_name = link['entity']
                linked_entity = entities[linked_entity_name]
                linked_objects_by_id = {}

                if link['type'].startswith('internal'):
                    link_path = link['json-path']
                    link_path_list = remove_empty(link_path.split('.'))

                    link_values = remove_none(as_list(get_in(obj, link_path_list)))
                    if not link_values:
                        if link.get('required'):
                            raise BrokenLink("Could not find required field '{}' in {} object id '{}'"
                                             .format(link_path, entity_name, object_id))
                        continue

                    if link['type'] == 'internal-object':
                        for link_value in link_values:
                            link_id = get_identifier(linked_entity_name, link_value)
                            linked_objects_by_id[link_id] = link_value

                    elif link['type'] == 'internal':
                        link_id_field = linked_entity['name'] + 'DbId'
                        link_name_field = linked_entity['name'] + 'Name'
                        for link_value in link_values:
                            link_id = link_value.get(link_id_field)
                            link_name = link_value.get(link_name_field)
                            if link_id:
                                linked_objects_by_id[link_id] = {link_id_field: link_id, link_name_field: link_name}

                elif link['type'] == 'external-object':
                    call = get_implemented_call(source, link, context=obj)
                    if not call:
                        continue

                    link_values = list(BreedingAPIIterator.fetch_all(source['brapi:endpointUrl'], call, logger))
                    for link_value in link_values:
                        link_id = get_identifier(linked_entity_name, link_value)
                        linked_objects_by_id[link_id] = link_value

                link_objects(entity, obj, linked_entity, linked_objects_by_id)
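The shape of a 'links' entry can be read off the lookups above; a hypothetical configuration for the 'study' entity might look like the sketch below (keys are taken from the code, values are illustrative, and '...' marks details this function delegates elsewhere):

study_links = [
    # 'internal': follow an identifier field inside the JSON object
    {'type': 'internal', 'entity': 'location', 'json-path': '...',
     'required': False},
    # 'internal-object': follow an object nested inside this one
    {'type': 'internal-object', 'entity': 'location', 'json-path': 'location'},
    # 'external-object': fetch linked objects through a dedicated BrAPI call
    # (call details are resolved by get_implemented_call)
    {'type': 'external-object', 'entity': 'observationVariable'},
]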
Code example #5
def add(self, data):
    # Compact object by removing nulls and empty values
    data = remove_empty(data)
    if data:
        data['source'] = self.source_id
        data_id = get_identifier(self.entity_name, data)
        if data_id in self:
            # Merge with the record already stored under this id
            dict_merge(self[data_id], data)
        else:
            self[data_id] = data
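For context, `self` behaves like a dict keyed by the entity identifier; a minimal sketch of the enclosing class, inferred only from the attributes used in add() (not the project's actual definition):

class MergeStore(dict):
    # Sketch: a dict of records keyed by identifier, tagged with their source
    def __init__(self, source_id, entity_name):
        super().__init__()
        self.source_id = source_id
        self.entity_name = entity_name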
Code example #6
def fetch_all_in_store(entities, fetch_function, arguments, pool):
    """
    Run a fetch function with arguments in a pool worker and collect results in the entity MergeStore
    """
    # Dispatch to pool workers (chunksize=4) and drop empty results
    results = remove_empty(pool.imap_unordered(fetch_function, arguments, 4))
    if not results:
        return

    for (entity_name, data_list) in results:
        for data in data_list:
            entities[entity_name]['store'].add(data)
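A hedged usage sketch; fetch_entity and the argument tuples are hypothetical, but fetch_function must yield (entity_name, data_list) pairs, as the unpacking above requires:

from multiprocessing.pool import ThreadPool

pool = ThreadPool(4)
arguments = [(source, entity) for entity in entities.values()]  # assumed shape
fetch_all_in_store(entities, fetch_entity, arguments, pool)  # fetch_entity: hypothetical worker
pool.close()
pool.join()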
Code example #7
def resolve_map_template(template, data, data_index):
    elements = template.get('{map}')
    transform = template.get('{to}')

    resolved = as_list(resolve(elements, data, data_index))
    if not isinstance(resolved, list):
        raise Exception("Map can only work on lists.")
    if not remove_empty(resolved):
        return None
    resolved = list(
        map(lambda value: resolve(transform, value, data_index), resolved))
    return resolved
Code example #8
import itertools
import json
import re
from functools import reduce

def resolve_field_value_template(tree, data, data_index):
    object_paths = as_list(get_in(tree, ['start', 'object_path']))
    for object_path in object_paths:
        field_path = get_field_path(object_path)
        ids = as_list(get_in(data, field_path))
        if not ids:
            return None
        # Derive the entity name from the link field (ex: 'germplasmURIs' -> 'germplasm')
        field = field_path[-1]
        entity = re.sub(r"(\w+)URIs?", r"\1", field)
        entity_index = data_index[entity]
        try:
            # On-disk index (ex: UnQLite): values are JSON-encoded bytes
            data = remove_none([json.loads(entity_index[data_id].decode())
                                for data_id in ids])
        except AttributeError:
            # In-memory index: values are already decoded objects
            data = remove_none([entity_index[data_id] for data_id in ids])

        if getattr(entity_index, 'close', False):
            entity_index.close()
    value_paths = as_list(get_in(tree, ['start', 'value_path']))
    if len(value_paths) == 1:
        return get_in(data, get_field_path(value_paths[0]))
    # Several value paths: join each combination of values into one string
    new_value = []
    for item in as_list(data):
        field_values = remove_empty(
            map(lambda value_path: as_list(
                    get_in(item, get_field_path(value_path))) or None,
                value_paths))
        for field_value in itertools.product(*field_values):
            # Concatenate the combination, skipping redundant prefixes
            joined = reduce(
                lambda acc, s: s if s.startswith(acc) else acc + " " + s,
                field_value, "")
            new_value.append(joined)
    return list(distinct(remove_empty(new_value)))
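The entity name is recovered from the link field by the re.sub call above, for example:

import re

print(re.sub(r"(\w+)URIs?", r"\1", "germplasmURIs"))  # -> germplasm
print(re.sub(r"(\w+)URIs?", r"\1", "locationURI"))    # -> location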
Code example #9
def fetch_all_in_store(entities, fetch_function, arguments, pool):
    """
    Run a fetch function with arguments in a pool worker and collect results in the entity MergeStore
    """
    source_name = arguments[0][0]['schema:identifier']
    results = remove_empty(pool.imap_unordered(fetch_function, arguments, 4))
    if not results:
        return

    for (entity_name, data_list) in results:
        for data in data_list:
            if source_name == 'WUR' and entity_name == 'study':
                # Source-specific fix-up (assumes WUR studies give a bare year)
                data['startDate'] = data['startDate'] + "-01-01"
            entities[entity_name]['store'].add(data)
Code example #10
def remove_internal_objects(entities):
    """
    Remove objects referenced inside others (example: trial.studies or study.location)
    """
    for (entity_name, entity) in entities.items():
        for link in (entity.get('links') or []):
            if link['type'] != 'internal-object':
                continue

            for (_, data) in entity['store'].items():
                link_path = link['json-path']
                link_path_list = remove_empty(link_path.split('.'))

                context_path, last = link_path_list[:-1], link_path_list[-1]
                link_context = get_in(data, context_path)
                if link_context and last in link_context:
                    del link_context[last]
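An illustration of the effect on hypothetical data, assuming an 'internal-object' link on 'study' with json-path 'location' (mirroring the docstring's example):

study = {'studyDbId': '1', 'location': {'locationDbId': '2'}}
# after remove_internal_objects, the nested copy is dropped:
# study == {'studyDbId': '1'}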
Code example #11
def test_remove_empty(self):
    input_value = [None, [], "", {1, 2, None}, 0, {"a": {"b": {}}}]
    expected = [{1, 2}, 0]
    actual = remove_empty(input_value)
    self.assertEqual(expected, actual)
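A minimal sketch of a remove_empty consistent with this test; the project's real helper likely differs (elsewhere in these examples it is also fed iterators such as map objects):

def remove_empty(value):
    # Recursively drop None, empty strings and empty containers,
    # keeping falsy scalars such as 0 and False
    def is_empty(v):
        return v is None or (hasattr(v, '__len__') and len(v) == 0)

    if isinstance(value, dict):
        cleaned = {k: remove_empty(v) for k, v in value.items()}
        return {k: v for k, v in cleaned.items() if not is_empty(v)}
    if isinstance(value, (list, set, tuple)):
        return type(value)(
            v for v in map(remove_empty, value) if not is_empty(v))
    return value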
Code example #12
import base64
import json
from typing import Tuple

# UnQLite and its open flags come from the 'unqlite' package
from unqlite import UnQLite, UNQLITE_OPEN_READONLY, UNQLITE_OPEN_MMAP

def transform_uri_link(source: dict, entities: dict, ignore_links,
                       id_index_files: dict,
                       entity_line: Tuple[str, str]) -> dict:
    """
    Transform BrAPI data by adding URI links translated from DbId links and replacing DbIds with encoded URIs.
    Also checks entity links to make sure every referenced entity exists.
    """
    entity, line = entity_line
    data = remove_empty(json.loads(line))

    data_id = get_identifier(entity, data)
    data[f"{entity}DbId"] = str(data_id)

    data_uri = get_generate_uri(source, entity, data)
    data[f"{entity}URI"] = data_uri

    # Add basic JSON-LD fields (store URI as @id)
    data['@type'] = entity
    data['@id'] = data_uri

    # Add basic schema.org fields
    data['schema:includedInDataCatalog'] = source['@id']
    data['schema:identifier'] = data_id
    data['schema:name'] = data.get('schema:name') or data.get(entity + 'Name')

    # Create URI links for each DbId link
    id_links = get_entity_links(data, 'DbId')
    for linked_entity, link_path, link_value in id_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_uri_field = f"{linked_entity}URI{plural}"
        link_uri_path = [*link_path[:-1], link_uri_field]

        alias = None
        if linked_entity not in id_index_files:
            # Try to find an alias for the linked entity (ex: parent1 in pedigree is a germplasm)
            aliases = map(
                lambda link: link['entity-alias'],
                # Among the links configured for the current entity
                filter(lambda link: (link['entity'] == linked_entity
                                     and 'entity-alias' in link),
                       get_in(entities, [data['@type'], 'links']) or []))
            alias = next(aliases, None)

        # Linked entity index by Id
        try:
            id_index_file = id_index_files[alias or linked_entity]
        except KeyError as e:
            raise MissingDataLink(
                f"No '{alias or linked_entity}' data available to verify '{link_path}' data link "
                f"in JSON object:\n"
                f"{data}\n"
                f"If you want to ignore the '{alias or linked_entity}' data links add it to the 'ignore-links' "
                f"config option.\n"
                f"If you want to extract the '{alias or linked_entity}' from '{data['@type']}', add an "
                f"'internal-object' link in the 'config/extract-brapi/entities/{data['@type']}' config file.\n"
                f"If the path '{link_path}' corresponds to another type of entity, add an 'internal' link"
                f"with a 'entity-alias' in the 'config/extract-brapi/entities/{data['@type']}' config file."
            ) from e

        # Open the per-entity id->URI index read-only, memory-mapped
        uri_index = UnQLite(id_index_file,
                            flags=UNQLITE_OPEN_READONLY | UNQLITE_OPEN_MMAP)

        def get_in_index(link_id):
            try:
                return uri_index[link_id].decode()
            except KeyError as e:
                raise MissingDataLink(
                    f"Could not find '{alias or linked_entity}' with id '{link_id}' "
                    f"found in '{link_path}' of object:\n{data}") from e

        if plural:
            link_uri = list(map(get_in_index, link_value))
        else:
            link_uri = get_in_index(link_value)

        update_in(data, link_uri_path, link_uri)

    def encode_uri(uri):
        return base64.b64encode(str(uri).encode()).decode()

    # Replace DbId with b64 encoded URI
    uri_links = get_entity_links(data, 'URI')
    for linked_entity, link_path, link_value in uri_links:
        if linked_entity in ignore_links:
            continue
        plural = 's' if is_collection(link_value) else ''
        link_id_field = f"{linked_entity}DbId{plural}"
        link_id_path = [*link_path[:-1], link_id_field]

        if plural:
            link_id = list(map(encode_uri, link_value))
        else:
            link_id = encode_uri(link_value)

        update_in(data, link_id_path, link_id)

    return data
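The DbId replacement is plain base64 over the URI string; a quick standalone check (the URI scheme depends on get_generate_uri and is assumed here):

import base64

def encode_uri(uri):
    return base64.b64encode(str(uri).encode()).decode()

print(encode_uri('urn:MYSOURCE/study/123'))  # the encoded value becomes the new '<entity>DbId'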
Code example #13
import os
import signal
import sys

def launch_etl(options, config):
    # Exit cleanly on Ctrl-C
    def handler(*_):
        sys.exit(0)

    signal.signal(signal.SIGINT, handler)
    default_index_template = config['load-elasticsearch']['index-template']

    # Execute ETL actions based on CLI arguments:
    if 'extract' in options or 'etl_es' in options or 'etl_virtuoso' in options:
        etl.extract.brapi.main(config)

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']

        # Restrict the list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base jsonschema definitions into each document jsonschema
        validation_config = transform_config['validation']
        base_definitions = validation_config['base-definitions']
        for document_schema in validation_config['documents'].values():
            document_schema['definitions'] = base_definitions

        # Run transform
        etl.transform.elasticsearch.main(config)

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with an absolute path
        for entity in config['transform-jsonld']['entities'].values():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(entity['@context'],
                                os.path.join(config['conf-dir'],
                                             'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

        etl.transform.jsonld.main(config)

    if 'transform_rdf' in options or 'etl_virtuoso' in options:
        etl.transform.rdf.main(config)

    if 'load_elasticsearch' in options or 'etl_es' in options:
        mapping_files = list_entity_files(
            os.path.join(config['conf-dir'], 'elasticsearch'))

        selected_document_types = None
        if options.get('document_types'):
            selected_document_types = set(options['document_types'].split(','))

        load_config = config['load-elasticsearch']
        load_config['url'] = '{}:{}'.format(options['host'], options['port'])
        load_config['mappings'] = dict(mapping_files)
        load_config['index-template'] = (options.get('index_template')
                                         or default_index_template)
        load_config['document-types'] = selected_document_types
        etl.load.elasticsearch.main(config)

    if 'load_virtuoso' in options or 'etl_virtuoso' in options:
        etl.load.virtuoso.main(config)
Code example #14
import json
import os

def extend_config(config, options):
    """
    Extend the configuration with the options provided as CLI arguments
    """
    config['options'] = options

    # Data output dir
    config['data-dir'] = get_folder_path(
        [options.get('data_dir') or config['default-data-dir']], create=True)

    # Sources config
    config['sources'] = dict()
    source_id_field = 'schema:identifier'
    for source_file in (options.get('sources') or list()):
        source_config = json.loads(source_file.read())
        if source_id_field not in source_config:
            raise Exception(
                "No field '{}' in data source JSON configuration file '{}'"
                .format(source_id_field, source_file.name))
        identifier = source_config[source_id_field]
        if identifier in config['sources']:
            raise Exception(
                "Source id '{}' found twice in source list: {}\n"
                "Please verify the '{}' field in your files.".format(
                    identifier, options['sources'], source_id_field))
        config['sources'][identifier] = source_config

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']
        transform_config['documents'] = list(
            transform_config['documents'].values())

        # Restrict the list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base jsonschema definitions into each document jsonschema
        validation_schemas = transform_config['validation-schemas']
        base_definitions = validation_schemas['base-definitions']
        for document_schema in validation_schemas.values():
            # Skip the shared 'base-definitions' entry itself
            if document_schema != base_definitions:
                document_schema['definitions'] = base_definitions

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with an absolute path
        for entity in config['transform-jsonld']['entities'].values():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(entity['@context'],
                                os.path.join(config['conf-dir'],
                                             'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

    if 'load_elasticsearch' in options or 'etl_es' in options:
        load_elasticsearch = config['load-elasticsearch']

        # CLI selected list of document types
        selected_document_types = None
        if options.get('document_types'):
            selected_document_types = set(options['document_types'].split(','))
        load_elasticsearch['document-types'] = selected_document_types

        elasticsearch_config = load_elasticsearch['config']

        load_elasticsearch['index-template'] = options.get(
            'index_template') or elasticsearch_config['index-template']

        load_elasticsearch['url'] = '{}:{}'.format(
            options['host'] or elasticsearch_config['host'],
            options['port'] or elasticsearch_config['port'])
    return config
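A hedged sketch of the options dict this function expects; the keys are inferred from the lookups above and the values are illustrative only:

options = {
    'data_dir': '/tmp/brapi-data',
    'sources': [open('sources/mysource.json')],  # file objects (.read()/.name are used above)
    'transform_elasticsearch': True,  # presence of the key selects the ETL step
    'document_types': 'study,germplasm',
    'host': 'localhost',
    'port': '9200',
}
config = extend_config(config, options)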