def get_jsonld_contexts(base_dir, config):
    jsonld_contexts = dict()
    for entity_name in config['entities']:
        if '@context' in config['entities'][entity_name]:
            jsonld_contexts[entity_name] = get_file_path(
                [base_dir, config['entities'][entity_name]['@context']])
    return jsonld_contexts
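A minimal, hypothetical call to get_jsonld_contexts for orientation (the config layout is inferred from the function above; the paths are illustrative, not from the original project):

config = {'entities': {'germplasm': {'@context': 'contexts/germplasm.jsonld'},
                       'study': {}}}  # hypothetical config fragment
contexts = get_jsonld_contexts('/etc/etl', config)
# Only entities declaring an '@context' appear in the result, e.g.
# {'germplasm': <path built by get_file_path from '/etc/etl' and the context file>}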
def list_entity_files(json_dir):
    for file_name in os.listdir(json_dir):
        matches = re.search(r'^([a-zA-Z]+).*\.json$', file_name)
        if not matches:
            continue
        entity_name = matches.groups()[0]
        json_path = get_file_path([json_dir, file_name])
        yield entity_name, json_path
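A short usage sketch for list_entity_files (the directory path is hypothetical; the generator yields one (entity_name, json_path) pair per matching JSON file):

json_dir = '/tmp/brapi-json'  # hypothetical directory of extracted JSON files
for entity_name, json_path in list_entity_files(json_dir):
    print(entity_name, '->', json_path)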
    def _new_file(self):
        json_path = None
        while not json_path or os.path.exists(json_path):
            self.file_index += 1
            json_path = get_file_path(
                [self.output_dir, self.base_json_name],
                ext="-" + str(self.file_index) + ".json")
            if self.file_index > 1000000:
                raise Exception('Max file index exceeded')
        return open(json_path, 'a')
Example #4
def load_folder(graph_uri, endpoint_rdf_dir, virtuoso_config):
    print('Loading RDF from "{}" \n\tinto Virtuoso graph <{}> on "{}"'.format(
        endpoint_rdf_dir, graph_uri, virtuoso_config['url']))
    for file_name in os.listdir(endpoint_rdf_dir):
        rdf_path = get_file_path([endpoint_rdf_dir, file_name])
        if not os.path.exists(rdf_path):
            continue

        load_rdf(virtuoso_config, rdf_path, graph_uri)
def extract_source(source, entities, config, output_dir):
    """
    Full JSON BrAPI source extraction process
    """
    source_name = source['schema:identifier']
    action = 'extract-' + source_name
    log_file = get_file_path([config['log-dir'], action],
                             ext='.log',
                             recreate=True)
    logger = create_logger(action, log_file, config['options']['verbose'])
    pool = ThreadPool(10)

    logger.info("Extracting BrAPI {}...".format(source_name))
    try:
        # Initialize JSON merge stores
        for (entity_name, entity) in entities.items():
            entity['store'] = MergeStore(source['schema:identifier'],
                                         entity['name'])

        # Fetch server implemented calls
        if 'implemented-calls' not in source:
            source['implemented-calls'] = get_implemented_calls(source, logger)

        # Fetch entity lists
        fetch_all_list(source, logger, entities, pool)

        # Detail entities
        fetch_all_details(source, logger, entities, pool)

        # Link entities (internal links, internal object links and external object links)
        fetch_all_links(source, logger, entities)

        # Detail entities (for objects that might have been discovered by links)
        fetch_all_details(source, logger, entities, pool)

        remove_internal_objects(entities)

        logger.info("SUCCEEDED Extracting BrAPI {}.".format(source_name))
    except Exception:
        logger.debug(traceback.format_exc())
        shutil.rmtree(output_dir)
        output_dir = output_dir + '-failed'
        logger.info(
            "FAILED Extracting BrAPI {}.\n"
            "=> Check the logs ({}) and data ({}) for more details.".format(
                source_name, log_file, output_dir))
    pool.close()

    # Save to file
    logger.info("Saving BrAPI {} to '{}'...".format(source_name, output_dir))
    for (entity_name, entity) in entities.items():
        entity['store'].save(output_dir)
        entity['store'].clear()
    def save(self, output_dir):
        if len(self) <= 0:
            return
        json_path = get_file_path([output_dir, self.entity_name],
                                  ext='.json',
                                  create=True)

        with open(json_path, 'w') as json_file:
            for data in self.values():
                if 'etl:detailed' in data:
                    del data['etl:detailed']
                CustomJSONEncoder.dump(data, json_file)
                json_file.write('\n')
def index_by(index_dir: str, index_extension: str, data_iter: iter,
             key_fn: Callable, value_fn: Callable, checkpoint: int,
             object_name: str):
    """
    Generate UnQlite data indices for each entity
    :param index_dir index directory
    :param index_extension index file extension
    :param data_iter iterable on data
    :param key_fn function to use on data to get the index key
    :param value_fn function to use on data to get the index value
    :param checkpoint commit index every checkpoints
    :return dict of index paths by entity name
    """
    i = 0
    index_path_by_entity = {}
    index_by_entity = {}
    for data in data_iter:
        entity = data['@type']
        if entity not in index_path_by_entity:
            index_path = get_file_path([index_dir, entity],
                                       ext=index_extension)
            index_path_by_entity[entity] = index_path

            index = UnQLite(index_path_by_entity[entity])
            index.begin()
            index_by_entity[entity] = index
        index = index_by_entity[entity]

        # Index
        index[str(key_fn(data))] = value_fn(data)

        i += 1
        # Log
        if i % 50000 == 0:
            print(f'checkpoint: {i} {object_name}')
        # Checkpoint
        if i % checkpoint == 0:
            # Flush indices
            for index in index_by_entity.values():
                index.commit()
                index.begin()
    print(f'checkpoint: {i} {object_name}')

    # Close indices
    for index in index_by_entity.values():
        index.commit()
        index.close()

    # Output all indices
    return index_path_by_entity
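An illustration of how index_by might be invoked (the data, directory, and key/value functions below are hypothetical, not taken from the original source; the index directory is assumed to exist):

sample_data = [
    {'@type': 'germplasm', '@id': 'urn:g1', 'schema:name': 'Wheat cv. A'},
    {'@type': 'germplasm', '@id': 'urn:g2', 'schema:name': 'Wheat cv. B'},
]
index_paths = index_by(
    index_dir='/tmp/indices',           # hypothetical, pre-existing directory
    index_extension='.unqlite',
    data_iter=iter(sample_data),
    key_fn=lambda data: data['@id'],            # key stored in the index
    value_fn=lambda data: data['schema:name'],  # value stored under that key
    checkpoint=10000,
    object_name='germplasm documents')
# index_paths maps each '@type' to its UnQLite index file path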
def transform_folder(institution_add_jsonld, json_dir, jsonld_dir):
    print('Transforming JSON from "{}" \n\tto JSON-LD in "{}"'.format(
        json_dir, jsonld_dir))

    # List of options
    options = list()
    for file_name in os.listdir(json_dir):
        matches = re.search(r'(\D+)(\d+)\.json', file_name)
        if matches:
            (entity_name, index) = matches.groups()

            src_path = get_file_path([json_dir, entity_name],
                                     ext=str(index) + '.json')
            dest_path = get_file_path([jsonld_dir, entity_name],
                                      ext=str(index) + '.jsonld')

            # Partial function application
            entity_add_jsonld = functools.partial(institution_add_jsonld,
                                                  entity_name)

            options.append((entity_add_jsonld, src_path, dest_path))

    # Run transform_to_jsonld on a thread pool
    pool_worker(transform_to_jsonld, options)
def read_json_lines(json_dir: str, out_queue: Queue):
    """
    Read JSON in source dir for each entity and output into queue
    """
    # List JSON files for each entity
    try:
        file_names = filter(lambda f: f.endswith(".json"),
                            os.listdir(json_dir))
    except FileNotFoundError:
        raise FileNotFoundError(
            f"No such file or directory: '{json_dir}'.\n"
            'Please make sure you have run the BrAPI extraction before trying to launch the transformation process.'
        )
    file_readers = {}
    for file_name in file_names:
        # Use the file base name as the entity name
        entity = os.path.splitext(os.path.basename(file_name))[0]
        file_readers[entity] = open(get_file_path([json_dir, file_name]), 'r')

        # (A simpler variant would read each file in full before moving on:
        # with open(get_file_path([json_dir, file_name]), 'r') as file:
        #     for line in file:
        #         out_queue.put((entity, line))
        # )

    # Read one line at a time from each open file to interleave the data flow
    while file_readers:
        for entity, file in list(file_readers.items()):
            line = file.readline()
            if not line:
                file.close()
                del file_readers[entity]
            else:
                out_queue.put((entity, line))

    # Signal no more data
    out_queue.put(None)
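A sketch of how read_json_lines might feed a consumer through a queue, assuming a standard-library queue.Queue (the directory path and consumer loop are illustrative; the reader runs in a separate thread and signals end-of-data with None):

from queue import Queue
from threading import Thread

out_queue = Queue(maxsize=1000)
reader = Thread(target=read_json_lines, args=('/tmp/brapi-json', out_queue))
reader.start()
while True:
    item = out_queue.get()
    if item is None:      # reader signals that all files are exhausted
        break
    entity, line = item
    # ... parse and process one JSON line for this entity ...
reader.join()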
Example #10
def load_source(source, config, source_bulk_dir, log_dir):
    """
    Full Elasticsearch documents indexing
    """
    source_name = source['schema:identifier']
    action = 'load-elasticsearch-' + source_name
    log_file = get_file_path([log_dir, action], ext='.log', recreate=True)
    logger = create_logger(source_name, log_file, config['verbose'])

    load_config = config['load-elasticsearch']
    es_client = init_es_client(load_config['url'], logger)

    logger.info("Loading '{}' into elasticsearch '{}'...".format(source_bulk_dir, load_config['url']))
    try:
        if not os.path.exists(source_bulk_dir):
            raise FileNotFoundError(
                'No such file or directory: \'{}\'.\n'
                'Please make sure you have run the BrAPI extraction and Elasticsearch document transformation'
                ' before trying to launch the loading process.'
                .format(source_bulk_dir))

        bulk_files = list(list_entity_files(source_bulk_dir))
        all_document_types = set(map(first, bulk_files))
        document_types = load_config.get('document-types') or all_document_types
        document_types = document_types.intersection(all_document_types)

        index_by_document = dict()

        logger.info("Preparing index with template mapping...")
        timestamp = int(time.time())
        for document_type in document_types:
            base_index_name = replace_template(
                load_config['index-template'],
                {'source': source['schema:identifier'], 'documentType': document_type}
            ).lower()
            create_template(es_client, load_config, document_type, base_index_name, logger)

            index_name = base_index_name + '-d' + str(timestamp)
            create_index(es_client, index_name, logger)
            index_by_document[document_type] = base_index_name, index_name

        logger.info("Bulk indexing...")
        for document_type, file_path in bulk_files:
            if document_type in index_by_document:
                base_index_name, index_name = index_by_document[document_type]
                bulk_index(es_client, index_name, file_path, logger)

        logger.info("Creating index aliases and deleting old indices...")
        for document_type, (base_index_name, index_name) in index_by_document.items():
            create_alias(es_client, index_name, base_index_name, logger)
            new_index, *old_indices = get_indices(es_client, base_index_name)
            for old_index in old_indices[1:]:
                delete_index(es_client, old_index, logger)

        logger.info("SUCCEEDED Loading {}.".format(source_name))
    except Exception as e:
        logger.debug(traceback.format_exc())
        logger.debug(getattr(e, 'long_message', ''))
        logger.info("FAILED Loading {} Elasticsearch documents.\n"
                    "=> Check the logs ({}) for more details."
                    .format(source_name, log_file))
Example #11
def launch_etl(options, config):
    def handler(*_):
        sys.exit(0)

    signal.signal(signal.SIGINT, handler)
    default_index_template = config['load-elasticsearch']['index-template']

    # Execute ETL actions based on CLI arguments:
    if 'extract' in options or 'etl_es' in options or 'etl_virtuoso' in options:
        etl.extract.brapi.main(config)

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']

        # Restrict the list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base jsonschema definitions into each document jsonschema
        validation_config = transform_config['validation']
        base_definitions = validation_config['base-definitions']
        for (document_type,
             document_schema) in validation_config['documents'].items():
            document_schema['definitions'] = base_definitions

        # Run transform
        etl.transform.elasticsearch.main(config)

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with absolute path
        for (entity_name,
             entity) in config['transform-jsonld']['entities'].items():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(
                            entity['@context'],
                            os.path.join(config['conf-dir'],
                                         'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

        etl.transform.jsonld.main(config)

    if 'transform_rdf' in options or 'etl_virtuoso' in options:
        etl.transform.rdf.main(config)

    if 'load_elasticsearch' in options or 'etl_es' in options:
        mapping_files = list_entity_files(
            os.path.join(config['conf-dir'], 'elasticsearch'))

        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))
        config['load-elasticsearch']['url'] = '{}:{}'.format(
            options['host'], options['port'])
        config['load-elasticsearch']['mappings'] = {
            document_type: file_path
            for document_type, file_path in mapping_files
        }
        config['load-elasticsearch']['index-template'] = options.get(
            'index_template') or default_index_template
        config['load-elasticsearch'][
            'document-types'] = selected_document_types
        etl.load.elasticsearch.main(config)

    if 'load_virtuoso' in options or 'etl_virtuoso' in options:
        etl.load.virtuoso.main(config)
Example #12
def extend_config(config, options):
    """
    Extend the configuration with the options provided in CLI arguments

    """
    config['options'] = options

    # Data output dir
    config['data-dir'] = get_folder_path(
        [options.get('data_dir') or config['default-data-dir']], create=True)

    # Sources config
    config['sources'] = dict()
    source_id_field = 'schema:identifier'
    for source_file in (options.get('sources') or list()):
        source_config = json.loads(source_file.read())
        if source_id_field not in source_config:
            raise Exception(
                "No field '{}' in data source JSON configuration file '{}'".
                format(source_id_field, source_file.name))
        identifier = source_config[source_id_field]
        if identifier in config['sources']:
            raise Exception(
                "Source id '{}' found twice in source list: {}\n"
                "Please verify the '{}' field in your files.".format(
                    identifier, options['sources'], source_id_field))
        config['sources'][identifier] = source_config

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']
        transform_config['documents'] = list(
            transform_config['documents'].values())

        # Restrict the list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base jsonschema definitions into each document jsonschema
        validation_schemas = transform_config['validation-schemas']
        base_definitions = validation_schemas['base-definitions']
        for (document_type, document_schema) in validation_schemas.items():
            if document_schema != base_definitions:
                document_schema['definitions'] = base_definitions

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with absolute path
        for (entity_name,
             entity) in config['transform-jsonld']['entities'].items():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(
                            entity['@context'],
                            os.path.join(config['conf-dir'],
                                         'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

    if 'load_elasticsearch' in options or 'etl_es' in options:
        load_elasticsearch = config['load-elasticsearch']

        # CLI selected list of document types
        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))
        load_elasticsearch['document-types'] = selected_document_types

        elasticsearch_config = load_elasticsearch['config']

        load_elasticsearch['index-template'] = options.get(
            'index_template') or elasticsearch_config['index-template']

        load_elasticsearch['url'] = '{}:{}'.format(
            options['host'] or elasticsearch_config['host'], options['port']
            or elasticsearch_config['port'])
    return config