def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, records_index,
                     records_total, entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    # Update collection stats
    index_collection(collection)

def bulk_write(collection, items): """Write a set of entities - given as raw dicts - to the index in bulk mode. This will perform validation but is dangerous as it means the application has no control over key generation and a few other aspects of building the entity. """ entities = {} for item in items: if not is_mapping(item): raise InvalidData("Failed to read input data") entity = model.get_proxy(item) if entity.id is None: raise InvalidData("No ID for entity") if entity.id in entities: entities[entity.id].merge(entity) else: entities[entity.id] = entity if len(entities) >= BULK_PAGE: index.index_bulk(collection.id, entities) entities = {} if len(entities): index.index_bulk(collection.id, entities)
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    entities = {}
    total = 0
    for idx, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity_id = entity.get('id')
            if entity_id is None:
                continue
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            base = entities.get(entity_id, {})
            entities[entity_id] = merge_data(entity, base)
            total += 1

        if idx % 1000 == 0:
            log.info("[%s] Loaded %s records, %s entities...",
                     collection.foreign_id, idx, total)

        if len(entities) >= BULK_PAGE:
            index_bulk(collection, entities, chunk_size=BULK_PAGE)
            entities = {}

    if len(entities):
        index_bulk(collection, entities, chunk_size=BULK_PAGE)
    # Update collection stats
    index_collection(collection)

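# Rough sketch of what a dict-level merge helper in the role of merge_data()
# might do in the loop above: fold a freshly mapped entity fragment into a
# previously seen fragment with the same id by unioning the property values.
# This is an illustration under that assumption, not Aleph's actual helper.
def merge_fragments(fresh, base):
    merged = dict(base)
    merged.update({k: v for k, v in fresh.items() if k != 'properties'})
    properties = dict(base.get('properties', {}))
    for prop, values in fresh.get('properties', {}).items():
        existing = set(properties.get(prop, []))
        properties[prop] = sorted(existing.union(values))
    merged['properties'] = properties
    return merged
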
def bulk_write(collection, items, merge=True): """Write a set of entities - given as dicts - to the index in bulk mode. This will perform validation but is dangerous as it means the application has no control over key generation and a few other aspects of building the entity. """ namespace = Namespace(collection.foreign_id) entities = {} for item in items: if not is_mapping(item): raise InvalidData("Failed to read input data", errors=item) entity = model.get_proxy(item) entity = namespace.apply(entity) entity.context = { 'bulk': True, 'collection_id': collection.id } if entity.id is None: raise InvalidData("No ID for entity", errors=item) if entity.id in entities: entities[entity.id].merge(entity) else: entities[entity.id] = entity if len(entities) >= BULK_PAGE: index.index_bulk(collection.id, entities, merge=merge) entities = {} if len(entities): index.index_bulk(collection.id, entities, merge=merge) refresh_collection(collection)
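# Conceptual sketch of what the namespace.apply() call above is for: entity
# ids get signed with the collection's foreign_id so that two collections can
# submit the same raw ids without colliding in the index. This is not the
# followthemoney Namespace implementation, only an illustration of the idea.
import hashlib
import hmac

def sign_entity_id(foreign_id, entity_id):
    digest = hmac.new(foreign_id.encode(), entity_id.encode(), hashlib.sha1)
    return '%s.%s' % (entity_id, digest.hexdigest())

# sign_entity_id('my_dataset', 'acme-co') -> 'acme-co.<digest>'
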
def index_aggregate(stage, collection, sync=False, entity_ids=None,
                    mapping_id=None):
    """Project the contents of the collection's aggregator into the index."""
    entities = _fetch_entities(stage, collection, entity_ids=entity_ids)
    entities = (_process_entity(e, sync=sync) for e in entities)
    extra = {'job_id': stage.job.id, 'mapping_id': mapping_id}
    index_bulk(collection, entities, extra, sync=sync)
    refresh_collection(collection.id, sync=sync)

def index_aggregator(collection, aggregator, entity_ids=None, sync=False):
    def _generate():
        idx = 0
        entities = aggregator.iterate(entity_id=entity_ids)
        for idx, proxy in enumerate(entities):
            if idx > 0 and idx % 1000 == 0:
                log.debug("[%s] Index: %s...", collection, idx)
            yield proxy
        log.debug("[%s] Indexed %s entities", collection, idx)

    entities_index.index_bulk(collection, _generate(), sync=sync)
    aggregator.close()

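# Hedged sketch of how a streaming index_bulk() could consume the generator
# above without buffering BULK_PAGE-sized batches in memory, assuming an
# Elasticsearch backend. The names (es, index_name, sketch_index_bulk) are
# placeholders for illustration and not Aleph's actual indexing API.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

def sketch_index_bulk(proxies, es=None, index_name='entities'):
    es = es or Elasticsearch()
    actions = (
        {'_index': index_name, '_id': proxy.id, '_source': proxy.to_dict()}
        for proxy in proxies
    )
    for ok, item in streaming_bulk(es, actions, chunk_size=500):
        if not ok:
            # streaming_bulk reports failures per document.
            log.warning("Indexing error: %r", item)
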
def bulk_write(collection, entities, job_id=None, unsafe=False): """Write a set of entities - given as dicts - to the index.""" def _generate(): for data in entities: if not is_mapping(data): raise InvalidData("Failed to read input data", errors=data) entity = model.get_proxy(data) if not unsafe: entity = remove_checksums(entity) yield _process_entity(entity) index_bulk(collection, _generate(), job_id=job_id) refresh_collection(collection.id)
def bulk_write(collection, entities, job_id=None, unsafe=False): """Write a set of entities - given as dicts - to the index.""" # This is called mainly by the /api/2/collections/X/_bulk API. def _generate(): for data in entities: if not is_mapping(data): raise InvalidData("Failed to read input data", errors=data) entity = model.get_proxy(data) if entity.id is None: raise InvalidData("No ID for entity", errors=entity.to_dict()) if not unsafe: entity = remove_checksums(entity) yield _process_entity(entity) index_bulk(collection, _generate(), {'job_id': job_id}) refresh_collection(collection.id)
def index_entities(stage, collection, iterable, sync=False):
    entities = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        tag_entity(entity)
        entities.append(entity)
        if len(entities) >= BULK_PAGE:
            stage.report_finished(len(entities))
            index_bulk(collection, entities, job_id=stage.job.id, sync=sync)
            entities = []
    if len(entities):
        stage.report_finished(len(entities))
        index_bulk(collection, entities, job_id=stage.job.id, sync=sync)
    refresh_collection(collection)

def index_entities(collection, iterable, sync=False):
    queue = get_queue(collection, OP_INDEX)
    queue.progress.mark_pending(len(iterable))
    entities = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        tag_entity(entity)
        entities.append(entity)
        if len(entities) >= BULK_PAGE:
            queue.progress.mark_finished(len(entities))
            index_bulk(collection, entities, sync=sync)
            entities = []
    if len(entities):
        queue.progress.mark_finished(len(entities))
        index_bulk(collection, entities, sync=sync)
    refresh_collection(collection)

def load_rows(query, rows): """Load a single batch of QUEUE_PAGE rows from the given query.""" entities = {} links = [] for row in rows: entity_map = {} for entity in query.entities: data = entity.to_index(row) if data is not None: entity_map[entity.name] = data entities[data['id']] = data for link in query.links: for inverted in [False, True]: data = link.to_index(row, entity_map, inverted=inverted) if data is not None: links.append(data) index_bulk(entities, links) log.info("[%s] Indexed %s rows as %s entities, %s links...", query.collection.foreign_id, len(rows), len(entities), len(links))
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, records_index,
                     records_total, entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    refresh_collection(collection)

def bulk_write(collection, items, merge=True, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        if not unsafe:
            entity = namespace.apply(entity)
            entity = remove_checksums(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}
    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)
    refresh_collection(collection)

def index_aggregate(stage, collection, entity_id=None, sync=False):
    """Project the contents of the collection's aggregator into the index."""
    entities = _fetch_entities(stage, collection, entity_id=entity_id)
    entities = (_process_entity(e, sync=sync) for e in entities)
    index_bulk(collection, entities, job_id=stage.job.id)
    refresh_collection(collection.id)