def map_to_aggregator(collection, mapping, aggregator):
    table = get_entity(mapping.table_id)
    if table is None:
        # Fall back to the aggregator when the table isn't in the index yet.
        table = aggregator.get(mapping.table_id)
    if table is None:
        raise RuntimeError("Table cannot be found: %s" % mapping.table_id)
    config = {"csv_url": _get_table_csv_link(table), "entities": mapping.query}
    mapper = model.make_mapping(config, key_prefix=collection.foreign_id)
    origin = mapping_origin(mapping.id)
    # Clear out fragments left over from a previous run of this mapping.
    aggregator.delete(origin=origin)
    writer = aggregator.bulk()
    idx = 0
    for idx, record in enumerate(mapper.source.records, 1):
        if idx > 0 and idx % 1000 == 0:
            log.info("[%s] Mapped %s rows ...", mapping.id, idx)
        for entity in mapper.map(record).values():
            entity.context = mapping.get_proxy_context()
            if entity.schema.is_a("Thing"):
                entity.add("proof", mapping.table_id)
            entity = collection.ns.apply(entity)
            entity = remove_checksums(entity)
            writer.put(entity, fragment=idx, origin=origin)
            if mapping.entityset is not None:
                EntitySetItem.save(
                    mapping.entityset,
                    entity.id,
                    collection_id=collection.id,
                    added_by_id=mapping.role_id,
                )
    writer.flush()
    log.info("[%s] Mapping done (%s rows)", mapping.id, idx)

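# The mapping_origin() helper used above presumably derives a stable origin
# tag from the mapping id, so aggregator.delete(origin=...) only clears the
# fragments produced by this particular mapping. A minimal sketch, assuming
# that convention; the "mapping:%s" format and the _sketch name are
# illustrative assumptions, not the confirmed implementation:
def mapping_origin_sketch(mapping_id):
    return "mapping:%s" % mapping_id
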
def bulk_write(collection, entities, safe=False, role_id=None, mutable=True, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        entity = model.get_proxy(data, cleaned=False)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if safe:
            entity = remove_checksums(entity)
        entity.context = {"role_id": role_id, "mutable": mutable}
        for field in ("created_at", "updated_at"):
            timestamp = data.get(field)
            if timestamp is not None:
                dt = registry.date.to_datetime(timestamp)
                if dt is not None:
                    entity.context[field] = dt.isoformat()
        writer.put(entity, origin="bulk")
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)

def bulk_write(collection, entities, unsafe=False, role_id=None, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    now = datetime.utcnow().isoformat()
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        if not is_mapping(data):
            raise InvalidData("Failed to read input data", errors=data)
        entity = model.get_proxy(data)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entity.context = {
            'role_id': role_id,
            'created_at': now,
            'updated_at': now,
        }
        writer.put(entity, origin='bulk')
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)

def map_to_aggregator(collection, mapping, aggregator):
    table = get_entity(mapping.table_id)
    if table is None:
        table = aggregator.get(mapping.table_id)
    if table is None:
        raise RuntimeError("Table cannot be found: %s" % mapping.table_id)
    config = {
        'csv_url': _get_table_csv_link(table),
        'entities': mapping.query
    }
    mapper = model.make_mapping(config, key_prefix=collection.foreign_id)
    origin = mapping_origin(mapping.id)
    aggregator.delete(origin=origin)
    writer = aggregator.bulk()
    # Initialise idx so the final log line works even for an empty source.
    idx = 0
    for idx, record in enumerate(mapper.source.records, 1):
        if idx > 0 and idx % 1000 == 0:
            log.info("[%s] Mapped %s rows ...", mapping.id, idx)
        for entity in mapper.map(record).values():
            entity.context = mapping.get_proxy_context()
            entity.context['mutable'] = True
            if entity.schema.is_a('Thing'):
                entity.add('proof', mapping.table_id)
            entity = collection.ns.apply(entity)
            entity = remove_checksums(entity)
            writer.put(entity, fragment=idx, origin=origin)
    writer.flush()
    log.info("[%s] Mapping done (%s rows)", mapping.id, idx)

def _generate():
    # Closure: entities, unsafe and _process_entity come from the enclosing scope.
    for data in entities:
        if not is_mapping(data):
            raise InvalidData("Failed to read input data", errors=data)
        entity = model.get_proxy(data)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        if not unsafe:
            entity = remove_checksums(entity)
        yield _process_entity(entity)

def test_remove_checksums(self):
    proxy = model.get_proxy(
        {
            "id": "banana",
            "schema": "Document",
            "properties": {"contentHash": ["banana"], "title": ["foo"]},
        }
    )
    proxy = remove_checksums(proxy)
    assert not proxy.has("contentHash")
    assert proxy.has("title")

def test_remove_checksums(self):
    proxy = model.get_proxy({
        'id': 'banana',
        'schema': 'Document',
        'properties': {
            'contentHash': ['banana'],
            'title': ['foo']
        }
    })
    proxy = remove_checksums(proxy)
    assert not proxy.has('contentHash')
    assert proxy.has('title')

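# A hedged sketch of the behaviour the tests above exercise: checksum-typed
# properties (e.g. Document.contentHash) are stripped, everything else is
# kept. This is an illustration built on the public followthemoney
# EntityProxy API (iterprops/pop), not the confirmed implementation of the
# remove_checksums helper; hence the _sketch name.
from followthemoney import model
from followthemoney.types import registry


def remove_checksums_sketch(proxy):
    """Drop checksum-type properties so callers cannot claim content
    hashes they do not actually have access to."""
    for prop in list(proxy.iterprops()):
        if prop.type == registry.checksum:
            proxy.pop(prop)
    return proxy


# Mirrors the test above: contentHash is removed, title survives.
proxy = model.get_proxy(
    {
        "id": "banana",
        "schema": "Document",
        "properties": {"contentHash": ["banana"], "title": ["foo"]},
    }
)
assert not remove_checksums_sketch(proxy).has("contentHash")
assert proxy.has("title")
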
def bulk_write(collection, entities, safe=False, role_id=None, mutable=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    for data in entities:
        entity = model.get_proxy(data, cleaned=False)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if safe:
            entity = remove_checksums(entity)
        entity.context = {"role_id": role_id, "mutable": mutable}
        # Keep the earliest created_at and the latest updated_at value.
        for field, func in (("created_at", min), ("updated_at", max)):
            ts = func(ensure_list(data.get(field)), default=None)
            dt = registry.date.to_datetime(ts)
            if dt is not None:
                entity.context[field] = dt.isoformat()
        writer.put(entity, origin="bulk")
        yield entity.id
    writer.flush()

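# Hedged usage sketch for the generator variant of bulk_write directly above:
# the writes only happen while the generator is consumed, so callers should
# drain it (e.g. via list()). The `collection` and `role_id` names stand in
# for an existing Aleph collection and role id and are placeholders, not
# real objects.
payload = [
    {
        "id": "doc-1",
        "schema": "Document",
        "properties": {"title": ["Quarterly report"], "contentHash": ["deadbeef"]},
    }
]
# With safe=True, remove_checksums() strips contentHash before the fragment
# is handed to the aggregator writer.
entity_ids = list(bulk_write(collection, payload, safe=True, role_id=role_id))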