import itertools
import json
import os
import re
import signal
import sys
import time
import traceback
from functools import partial

# Project-specific helpers (list_entity_files, first, compose, parse_data, CHUNK_SIZE,
# generate_uri_global_id, etc.) are imported from the ETL project's own modules (not shown here).


def load_all_data_with_uri(source, source_json_dir, transform_config, pool, logger):
    logger.debug("Loading BrAPI JSON from {}...".format(source_json_dir))

    # List JSON files per entity; optionally restrict to the entities actually needed
    # by the configured documents
    entity_files = list(list_entity_files(source_json_dir))
    if transform_config.get('restricted-documents'):
        document_configs = transform_config['documents']
        required_entities = get_required_entities(document_configs, source_json_dir)
        entity_files = list(filter(compose(required_entities.__contains__, first), entity_files))
    logger.debug("Loading entities: {}".format(', '.join(map(first, entity_files))))

    # Load stream of file lines
    all_lines = itertools.chain.from_iterable(map(load_entity_lines, entity_files))

    # Parse JSON to python objects
    all_data = pool.imap_unordered(parse_data, all_lines, CHUNK_SIZE)

    # Generate URIs (and create dict from entity/id to URI)
    uri_map = dict()
    data_list = list()
    for entity_name, data in all_data:
        data_id, data_uri = generate_uri_global_id(source, entity_name, data)
        uri_map[(entity_name, data_id)] = data_uri
        uri_map[(entity_name, get_identifier(entity_name, data))] = data_uri

        if is_checkpoint(len(data_list)):
            logger.debug("checkpoint: {} BrAPI objects loaded".format(len(data_list)))
        data_list.append(data)
    logger.debug("Loaded a total of {} BrAPI objects.".format(len(data_list)))

    # Replace all entity links using global ids (ex: studyDbId: 1 => studyDbId: urn:source%2Fstudy%2F1)
    generate_links = partial(generate_global_id_links, source, uri_map)
    return pool.imap_unordered(generate_links, data_list, CHUNK_SIZE)
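# Illustrative sketch only: the real generate_uri_global_id is defined elsewhere in the project.
# This shows one plausible way such a URN-style global identifier could be derived, assuming it
# percent-encodes "source/entity/id" as suggested by the example in the comment above
# (studyDbId: 1 => urn:source%2Fstudy%2F1). The function name and behavior here are assumptions,
# not the project's actual helper.
from urllib.parse import quote


def example_global_uri(source_id, entity_name, data_id):
    # Percent-encode the whole path so the URN stays a single opaque token (hypothetical helper)
    return 'urn:' + quote('{}/{}/{}'.format(source_id, entity_name, data_id), safe='')


# Example: example_global_uri('source', 'study', 1) == 'urn:source%2Fstudy%2F1'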
def get_required_entities(document_configs, source_json_dir):
    """
    Returns the set of entities required by all documents in the configuration
    """
    source_entities = set(
        remove_none(map(lambda d: d.get('source-entity'), document_configs)))

    def collect_entities(parsed_template):
        # Recursively walk a parsed document-transform template and collect every entity
        # referenced through a '...DbId'/'...DbIds' field
        if is_list_like(parsed_template):
            return set(flatten_it(map(collect_entities, parsed_template)))
        if isinstance(parsed_template, dict):
            if '{lark}' in parsed_template:
                # Parsed template expression: extract the entity name from field references
                # like 'studyDbId' or 'germplasmDbIds'
                entities = set()
                for object_path in as_list(
                        resolve_path(parsed_template, ['start', 'object_path'])):
                    fields = resolve_path(object_path, ['field_path', 'FIELD'])
                    match = re.search(r"^(\w+)DbId(s?)$", fields[-1])
                    if match:
                        entities.add(match.groups()[0])
                return entities
            return set(
                flatten_it(map(collect_entities, parsed_template.values())))
        return set()

    document_transforms = remove_none(
        map(lambda d: d.get('document-transform'), document_configs))
    required_entities = source_entities.union(
        flatten_it(map(collect_entities, document_transforms)))

    # Also collect entities linked from the first object of each source entity file
    if source_json_dir:
        all_files = list_entity_files(source_json_dir)
        filtered_files = list(
            filter(lambda x: x[0] in source_entities, all_files))
        for entity_name, file_path in filtered_files:
            with open(file_path, 'r') as file:
                line = file.readline()
                if line:
                    data = json.loads(line)
                    links = get_entity_links(data, 'DbId', 'PUI')
                    entity_names = set(map(first, links))
                    required_entities.update(entity_names)
    return required_entities
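# Small self-contained illustration of the '...DbId(s)' pattern used above: field names ending
# in 'DbId' or 'DbIds' are mapped back to the entity they reference. The sample field names
# below are only examples.
import re

DB_ID_PATTERN = re.compile(r"^(\w+)DbId(s?)$")

for field in ('studyDbId', 'germplasmDbIds', 'documentationURL'):
    match = DB_ID_PATTERN.search(field)
    print(field, '->', match.group(1) if match else None)
# studyDbId -> study, germplasmDbIds -> germplasm, documentationURL -> None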
def load_source(source, config, source_bulk_dir, log_dir):
    """
    Full Elasticsearch documents indexing
    """
    source_name = source['schema:identifier']
    action = 'load-elasticsearch-' + source_name
    log_file = get_file_path([log_dir, action], ext='.log', recreate=True)
    logger = create_logger(source_name, log_file, config['verbose'])

    load_config = config['load-elasticsearch']
    es_client = init_es_client(load_config['url'], logger)

    logger.info("Loading '{}' into elasticsearch '{}'...".format(source_bulk_dir, load_config['url']))
    try:
        if not os.path.exists(source_bulk_dir):
            raise FileNotFoundError(
                'No such file or directory: \'{}\'.\n'
                'Please make sure you have run the BrAPI extraction and Elasticsearch document'
                ' transformation before trying to launch the loading process.'
                .format(source_bulk_dir))

        bulk_files = list(list_entity_files(source_bulk_dir))
        all_document_types = set(map(first, bulk_files))
        document_types = load_config.get('document-types') or all_document_types
        document_types = document_types.intersection(all_document_types)

        index_by_document = dict()

        logger.info("Preparing index with template mapping...")
        timestamp = int(time.time())
        for document_type in document_types:
            base_index_name = replace_template(
                load_config['index-template'],
                {'source': source['schema:identifier'], 'documentType': document_type}
            ).lower()

            create_template(es_client, load_config, document_type, base_index_name, logger)

            # Index the data into a new timestamped index; the base name is later used as an alias
            index_name = base_index_name + '-d' + str(timestamp)
            create_index(es_client, index_name, logger)
            index_by_document[document_type] = base_index_name, index_name

        logger.info("Bulk indexing...")
        for document_type, file_path in bulk_files:
            if document_type in index_by_document:
                base_index_name, index_name = index_by_document[document_type]
                bulk_index(es_client, index_name, file_path, logger)

        logger.info("Creating index aliases and deleting old indices...")
        for document_type, (base_index_name, index_name) in index_by_document.items():
            create_alias(es_client, index_name, base_index_name, logger)
            # Keep the new index and the most recent previous one; delete anything older
            new_index, *old_indices = get_indices(es_client, base_index_name)
            for old_index in old_indices[1:]:
                delete_index(es_client, old_index, logger)

        logger.info("SUCCEEDED Loading {}.".format(source_name))
    except Exception as e:
        logger.debug(traceback.format_exc())
        logger.debug(getattr(e, 'long_message', ''))
        logger.info("FAILED Loading {} Elasticsearch documents.\n"
                    "=> Check the logs ({}) for more details."
                    .format(source_name, log_file))
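# Minimal sketch of the index-rotation pattern used above, written against the official
# 'elasticsearch' Python client. It is an assumption that the project's init_es_client,
# create_index, create_alias and get_indices helpers wrap calls of this kind; the function
# name and parameters below are illustrative, not the project's actual helpers.
import time

from elasticsearch import Elasticsearch


def rotate_index_sketch(es: Elasticsearch, base_index_name: str):
    # Create a fresh timestamped index, then point the stable alias at it
    index_name = '{}-d{}'.format(base_index_name, int(time.time()))
    es.indices.create(index=index_name)
    es.indices.put_alias(index=index_name, name=base_index_name)
    # Older timestamped indices matching the base name could then be listed and deleted,
    # e.g. via es.indices.get(index=base_index_name + '-d*')
    return index_name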
def launch_etl(options, config):
    # Exit cleanly on Ctrl+C
    def handler(*_):
        sys.exit(0)
    signal.signal(signal.SIGINT, handler)

    default_index_template = config['load-elasticsearch']['index-template']

    # Execute ETL actions based on CLI arguments:
    if 'extract' in options or 'etl_es' in options or 'etl_virtuoso' in options:
        etl.extract.brapi.main(config)

    if 'transform_elasticsearch' in options or 'etl_es' in options:
        transform_config = config['transform-elasticsearch']

        # Restrict the list of generated documents if requested
        input_doc_types = options.get('document_types')
        if input_doc_types:
            transform_config['restricted-documents'] = set(
                remove_empty(input_doc_types.split(',')))

        # Copy base jsonschema definitions into each document jsonschema
        validation_config = transform_config['validation']
        base_definitions = validation_config['base-definitions']
        for (document_type, document_schema) in validation_config['documents'].items():
            document_schema['definitions'] = base_definitions

        # Run transform
        etl.transform.elasticsearch.main(config)

    if 'transform_jsonld' in options or 'transform_rdf' in options or 'etl_virtuoso' in options:
        # Replace JSON-LD context path with absolute path
        for (entity_name, entity) in config['transform-jsonld']['entities'].items():
            if '@context' in entity:
                entity['@context'] = get_file_path(
                    [config['conf-dir'], entity['@context']])
                if not os.path.exists(entity['@context']):
                    raise Exception(
                        'JSON-LD context file "{}" defined in "{}" does not exist'
                        .format(
                            entity['@context'],
                            os.path.join(config['conf-dir'], 'transform-jsonld.json')))

        # Replace JSON-LD model path with an absolute path
        config['transform-jsonld']['model'] = get_file_path(
            [config['conf-dir'], config['transform-jsonld']['model']])

        etl.transform.jsonld.main(config)

    if 'transform_rdf' in options or 'etl_virtuoso' in options:
        etl.transform.rdf.main(config)

    if 'load_elasticsearch' in options or 'etl_es' in options:
        mapping_files = list_entity_files(
            os.path.join(config['conf-dir'], 'elasticsearch'))

        selected_document_types = None
        if 'document_types' in options and options['document_types']:
            selected_document_types = set(options['document_types'].split(','))

        config['load-elasticsearch']['url'] = '{}:{}'.format(
            options['host'], options['port'])
        config['load-elasticsearch']['mappings'] = {
            document_type: file_path
            for document_type, file_path in mapping_files
        }
        config['load-elasticsearch']['index-template'] = options.get(
            'index_template') or default_index_template
        config['load-elasticsearch']['document-types'] = selected_document_types

        etl.load.elasticsearch.main(config)

    if 'load_virtuoso' in options or 'etl_virtuoso' in options:
        etl.load.virtuoso.main(config)
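# Why the base jsonschema definitions are copied into each document schema above: local '$ref'
# pointers of the form '#/definitions/...' only resolve against the schema they live in, so the
# shared definitions must be embedded in every per-document schema before validation. This is a
# self-contained illustration assuming validation with the 'jsonschema' package; the schema
# content below is purely made up for the example.
from jsonschema import validate

base_definitions = {
    'name': {'type': 'string', 'minLength': 1},
}
document_schema = {
    'type': 'object',
    'properties': {'studyName': {'$ref': '#/definitions/name'}},
}
# Embed the shared definitions so the '$ref' above resolves
document_schema['definitions'] = base_definitions

validate({'studyName': 'Drought trial 2021'}, document_schema)  # passes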