def index_collections():
    """Drop and re-create the index entry for every collection.

    Collections are processed newest-first (descending primary key).
    """
    query = db.session.query(Collection).order_by(Collection.id.desc())
    for collection in query.all():
        log.info("Index [%s]: %s", collection.foreign_id, collection.label)
        # Clear any stale index document before writing the fresh one.
        index.delete_collection(collection.id)
        index.index_collection(collection)
def index_collection(collection, entities=False, refresh=False):
    """Write *collection* to the search index.

    When ``entities`` is true and the collection is not soft-deleted, a
    background task is queued to index its entities as well. ``refresh``
    additionally recomputes the cached collection state first.
    """
    log.info("Index [%s]: %s", collection.id, collection.label)
    is_live = collection.deleted_at is None
    if entities and is_live:
        index_collection_entities.delay(collection_id=collection.id)
    if refresh:
        refresh_collection(collection.id)
    index.index_collection(collection)
def bulk_load_query(collection_id, query):
    """Run a mapping *query* against its data source and bulk-index the
    resulting entities into the collection given by *collection_id*."""
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    pending, emitted = {}, 0
    for row_num, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity_id = entity.get('id')
            if entity_id is None:
                # Entities without a generated ID cannot be indexed.
                continue
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            pending[entity_id] = merge_data(entity, pending.get(entity_id, {}))
            emitted += 1
        if row_num % 1000 == 0:
            log.info("[%s] Loaded %s records, %s entities...",
                     collection.foreign_id, row_num, emitted)
        if len(pending) >= BULK_PAGE:
            # Flush a full page of accumulated entities.
            index_bulk(collection, pending, chunk_size=BULK_PAGE)
            pending = {}
    if pending:
        index_bulk(collection, pending, chunk_size=BULK_PAGE)
    # Update collection stats
    index_collection(collection)
def process_document(document):
    """Perform post-ingest tasks like analysis and indexing."""
    analyze_document(document)
    index_document(document)
    index_records(document)
    collection = document.collection
    # Casefile collections are re-indexed right away to reflect the change.
    if collection.casefile:
        index_collection(collection)
def bulk_load_query(collection_id, query):
    """Map the source records of *query* into entities and bulk-index them
    into the collection identified by *collection_id*.

    Entities are buffered per ID so repeated emissions of the same entity
    are merged before indexing; the buffer is flushed every BULK_PAGE
    entities and once more at the end.
    """
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    # Tabular sources know their length; streaming sources do not.
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            existing = entities.get(entity.id)
            if existing is not None:
                existing.merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1
        # Fixed: enumerate() starts at 1, so the former `records_index > 0`
        # check was always true and has been dropped.
        if records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, records_index, records_total,
                     entities_count)
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}
    # Flush the final, partially-filled page.
    index.index_bulk(collection.id, entities)
    # Update collection stats
    index_collection(collection)
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database. This is done by
    mapping the rows in the source data to entities and links which can be
    understood by the entity index.
    """
    for foreign_id, data in config.items():
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            collection = Collection.create({
                'foreign_id': foreign_id,
                'label': data.get('label') or foreign_id,
                'summary': data.get('summary'),
                'category': data.get('category'),
                'managed': True,
            })
        # Grant each configured role read access to the collection.
        for role_fk in dict_list(data, 'roles', 'role'):
            role = Role.by_foreign_id(role_fk)
            if role is not None:
                Permission.grant(collection, role, True, False)
            else:
                log.warning("Could not find role: %s", role_fk)
        db.session.commit()
        index_collection(collection)
        for query in dict_list(data, 'queries', 'query'):
            # Fixed: bulk_load_query takes the collection ID (it re-loads
            # the record via Collection.by_id), not the model object.
            bulk_load_query(collection.id, query)
def compute_collection(collection, sync=False):
    """Recompute statistics for *collection* and write it to the index.

    A cache key debounces repeated computations; ``sync`` forces one anyway.
    """
    key = cache.object_key(Collection, collection.id, 'stats')
    recently_computed = cache.get(key)
    if recently_computed and not sync:
        return
    # Mark as computed slightly inside the cache expiry window.
    cache.set(key, 'computed', expires=cache.EXPIRE - 60)
    log.info("Collection [%s] changed, computing...", collection.id)
    index.update_collection_stats(collection.id)
    index.index_collection(collection, sync=sync)
def update_collection(collection):
    """Create or update a collection."""
    if collection.deleted_at is not None:
        # Soft-deleted collections only need their index entry removed.
        index_delete(collection.id)
        return
    log.info("Updating: %r", collection)
    index_collection(collection)
def compute_collection(collection, force=False, sync=False):
    """Compute collection statistics and re-index the collection.

    Skips the work while a previous computation is still cached, unless
    ``force`` is set.
    """
    key = cache.object_key(Collection, collection.id, "stats")
    if not force and cache.get(key) is not None:
        return
    refresh_collection(collection.id)
    log.info("[%s] Computing statistics...", collection)
    index.update_collection_stats(collection.id)
    cache.set(key, "computed", expires=cache.EXPIRE)
    index.index_collection(collection, sync=sync)
def compute_collection(collection, force=False, sync=False):
    """Compute collection statistics and re-index the collection.

    A cached marker suppresses repeated runs; ``force`` bypasses it. The
    marker stores the time of the last computation.
    """
    key = cache.object_key(Collection, collection.id, "stats")
    if not force and cache.get(key) is not None:
        return
    refresh_collection(collection.id)
    log.info("[%s] Computing statistics...", collection)
    index.update_collection_stats(collection.id)
    # Record when the statistics were last computed.
    cache.set(key, datetime.utcnow().isoformat())
    index.index_collection(collection, sync=sync)
def update_entity_full(entity_id):
    """Perform update operations on entities."""
    entity = db.session.query(Entity).filter(Entity.id == entity_id).first()
    if entity is None:
        log.error("No entity with ID: %r", entity_id)
        return
    # De-duplicate alerts before refreshing the index entries.
    Alert.dedupe(entity.id)
    index_entity(entity)
    index_collection(entity.collection)
def index_collections(entities=False, refresh=False):
    """Re-index every collection, most recently updated first.

    ``entities`` queues a background task to index each live collection's
    entities; ``refresh`` recomputes cached collection state first.
    """
    query = Collection.all(deleted=True)
    query = query.order_by(Collection.updated_at.desc())
    for collection in query:
        log.info("Index [%s]: %s", collection.id, collection.label)
        is_live = collection.deleted_at is None
        if entities and is_live:
            index_collection_entities.delay(collection_id=collection.id)
        if refresh:
            refresh_collection(collection.id, sync=False)
        index.index_collection(collection)
def reindex_entities(block=5000):
    """Stream every entity of every collection back into the index.

    ``block`` controls the SQLAlchemy yield_per batch size.
    """
    collections = db.session.query(Collection)
    for collection in collections.yield_per(block):
        log.info("Indexing entities in: %r", collection)
        entity_query = (db.session.query(Entity)
                        .filter(Entity.collection == collection))
        for entity in entity_query.yield_per(block):
            # Use the one that's already loaded:
            entity.collection = collection
            index_entity(entity)
        index_collection(collection)
def create_collection(data, role=None):
    """Create (or fetch) a collection, index it, and return it.

    A CREATE_COLLECTION event is published only when the record was
    actually created by this call.
    """
    role = role or Role.load_cli_user()
    now = datetime.utcnow()
    collection = Collection.create(data, role=role, created_at=now)
    # Matching timestamps mean Collection.create made a new record rather
    # than returning an existing one.
    if collection.created_at == now:
        publish(Events.CREATE_COLLECTION,
                actor_id=role.id,
                params={'collection': collection})
    db.session.commit()
    index.index_collection(collection)
    return collection
def index_aggregate(queue, collection, sync=False):
    """Project the contents of the collections aggregator into the index."""
    aggregator = get_aggregator(collection)
    try:
        index_entities(collection, aggregator, sync=sync)
        refresh_collection(collection.id, sync=sync)
        index_collection(collection, sync=sync)
        log.info("Aggregate indexed: %r", collection)
    finally:
        # Always release the aggregator and clear the task queue entry,
        # even when indexing fails part-way.
        aggregator.close()
        queue.remove()
def update_collection(collection):
    """Create or update a collection."""
    if collection.deleted_at is not None:
        # Soft-deleted collections are only removed from the index.
        index_delete(collection.id)
        return
    now = datetime.utcnow()
    collection.updated_at = now
    db.session.add(collection)
    db.session.commit()
    log.info("Updating: %r", collection)
    index_collection(collection)
    flush_index()
def create_collection(foreign_id, data, role=None):
    """Fetch or create the collection identified by *foreign_id*, index it
    and return it.

    For an existing collection, only the language list is updated — and
    only when *data* actually supplies languages.
    """
    role = role or Role.load_cli_user()
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        data['foreign_id'] = foreign_id
        collection = Collection.create(data, role=role)
    else:
        languages = ensure_list(data.get('languages'))
        # Idiomatic truthiness check (was: `if len(languages):`).
        if languages:
            collection.languages = languages
    db.session.commit()
    index.index_collection(collection)
    return collection
def update_permission(role, collection, read, write):
    """Update a roles permission to access a given collection."""
    pre = Permission.by_collection_role(collection, role)
    post = Permission.grant(collection, role, read, write)
    db.session.commit()
    update_roles(collection)
    index_collection(collection)
    # Notify the affected role of its old and new access levels.
    collection_url = '%scollections/%s' % (app_url, collection.id)
    notify_role_template(role, collection.label, 'email/permission.html',
                         url=collection_url,
                         pre=pre, post=post,
                         collection=collection)
    return post
def update(foreign_id=None, index=False, process=False, reset=False):
    """Re-index all the collections and entities."""
    update_roles()
    targets = Collection.all(deleted=True)
    if foreign_id is not None:
        # Restrict the run to a single collection.
        targets = [get_collection(foreign_id)]
    for collection in targets:
        if reset:
            reset_collection(collection, sync=True)
        refresh_collection(collection.id)
        index_collection(collection)
        if collection.deleted_at is not None:
            # Deleted collections get indexed but not re-processed.
            continue
        if index or process:
            queue_task(collection, OP_PROCESS,
                       payload={'ingest': process})
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database. This is done by
    mapping the rows in the source data to entities and links which can be
    understood by the entity index.
    """
    for foreign_id, data in config.items():
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            data['foreign_id'] = foreign_id
            data['label'] = data.get('label', foreign_id)
            collection = Collection.create(data)
        db.session.commit()
        index_collection(collection)
        for query in dict_list(data, 'queries', 'query'):
            # Queue each mapping query as a prioritized background task.
            bulk_load_query.apply_async([collection.id, query], priority=6)
def ingest(document_id, role_id=None):
    """Process a given document by extracting its contents. This may include
    creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return
    get_manager().ingest_document(document, role_id=role_id)
    collection = document.collection
    # Keep casefile collections up to date as documents arrive.
    if collection.casefile:
        index_collection(collection)
    publish(Events.INGEST_DOCUMENT,
            actor_id=role_id,
            params={'document': document, 'collection': collection})
def create_collection(data, role=None, sync=False):
    """Create a collection, publish a creation event, and return its
    indexed representation.

    ``role`` defaults to the CLI user; ``sync`` is forwarded to the index.
    """
    role = role or Role.load_cli_user()
    created_at = datetime.utcnow()
    collection = Collection.create(data, creator=role, created_at=created_at)
    # Fixed: only publish for genuinely new records. Collection.create can
    # return a pre-existing collection, whose created_at will not match the
    # timestamp we just passed in (same guard as the sibling creators).
    if collection.created_at == created_at:
        publish(Events.CREATE_COLLECTION,
                params={'collection': collection},
                actor_id=role.id)
    db.session.commit()
    Authz.flush()
    refresh_collection(collection.id)
    return index.index_collection(collection, sync=sync)
def create_collection(data, role=None, sync=False):
    """Create a collection and return its indexed representation.

    A CREATE_COLLECTION event is published only when the record was created
    by this call rather than fetched.
    """
    role = role or Role.load_cli_user()
    now = datetime.utcnow()
    collection = Collection.create(data, role=role, created_at=now)
    # Timestamps match only when Collection.create made a fresh record.
    if collection.created_at == now:
        publish(Events.CREATE_COLLECTION,
                actor_id=role.id,
                params={'collection': collection})
    db.session.commit()
    Authz.flush()
    refresh_collection(collection.id)
    return index.index_collection(collection, sync=sync)
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database. This is done by
    mapping the rows in the source data to entities and links which can be
    understood by the entity index.
    """
    for foreign_id, data in config.items():
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            collection = Collection.create({
                'foreign_id': foreign_id,
                'label': data.get('label') or foreign_id,
                'summary': data.get('summary'),
                'category': data.get('category'),
                'managed': True,
            })
        db.session.commit()
        index_collection(collection)
        for query in dict_list(data, 'queries', 'query'):
            # Each mapping query runs as a background task.
            bulk_load_query.delay(collection.id, query)
def update_collection(collection):
    """Create or update a collection."""
    log.info("Updating: %r", collection)
    if collection.deleted_at is not None:
        # Deleted collections are dropped from the index instead.
        index.delete_collection(collection.id)
        return
    # re-process entities
    process_entities.delay(collection_id=collection.id)
    if collection.casefile:
        # Cross-reference casefiles against the rest of the index.
        xref_collection.apply_async([collection.id], priority=2)
    return index.index_collection(collection)
def update_collection(collection, roles=False):
    """Create or update a collection."""
    if collection.deleted_at is not None:
        # Soft-deleted collections only need their index entry removed.
        index_delete(collection.id)
        return
    collection.updated_at = datetime.utcnow()
    db.session.add(collection)
    db.session.commit()
    log.info("Updating: %r", collection)
    index_collection(collection)
    if roles:
        update_roles(collection)
    if not collection.managed:
        # Managed collections are mapping-loaded; skip cross-referencing.
        xref_collection.apply_async([collection.id], priority=2)
    # Queue a full re-index for every entity in the collection.
    entity_ids = db.session.query(Entity.id)
    entity_ids = entity_ids.filter(Entity.collection_id == collection.id)
    for entity in entity_ids.all():
        update_entity_full.apply_async([entity.id], priority=2)
    flush_index()
def update_collection(collection):
    """Create or update a collection."""
    log.info("Updating: %r", collection)
    if collection.deleted_at is not None:
        index_delete(collection.id)
        return
    if collection.casefile:
        xref_collection.apply_async([collection.id], priority=2)
    # TODO: rebuild dossiers
    # Queue a full re-index for each entity in the collection.
    entity_ids = db.session.query(Entity.id)
    entity_ids = entity_ids.filter(Entity.collection_id == collection.id)
    for entity in entity_ids:
        update_entity_full.apply_async([entity.id], priority=1)
    return index_collection(collection)
def update_collection(collection, sync=False):
    """Update a collection and re-index."""
    # Invalidate cached authorization state before re-indexing.
    Authz.flush()
    refresh_collection(collection.id)
    result = index.index_collection(collection, sync=sync)
    return result
def update_collection(collection, sync=False):
    """Create or update a collection."""
    # Reset authorization caches, refresh derived state, then re-index.
    Authz.flush()
    refresh_collection(collection.id)
    return index.index_collection(collection, sync=sync)
def index_collections():
    """Write every collection — deleted ones included — to the index."""
    for collection in Collection.all(deleted=True):
        log.info("Index [%s]: %s", collection.id, collection.label)
        index.index_collection(collection)
def index_collections():
    """Push all collections (including deleted ones) into the search index."""
    collections = Collection.all(deleted=True)
    for collection in collections:
        index.index_collection(collection)