Beispiel #1
0
def index_collections():
    """Re-index every collection, walking them by descending ID.

    For each collection the existing index entry is removed with
    ``index.delete_collection`` before ``index.index_collection`` is called.
    """
    newest_first = db.session.query(Collection).order_by(Collection.id.desc())
    for coll in newest_first.all():
        log.info("Index [%s]: %s", coll.foreign_id, coll.label)
        index.delete_collection(coll.id)
        index.index_collection(coll)
Beispiel #2
0
def index_collection(collection, entities=False, refresh=False):
    """Index a single collection, optionally queueing its entities as well.

    :param collection: the collection to index.
    :param entities: when true, and the collection has no ``deleted_at``
        value, ``index_collection_entities`` is dispatched via ``.delay``.
    :param refresh: when true, ``refresh_collection`` is called before the
        collection is indexed.
    """
    log.info("Index [%s]: %s", collection.id, collection.label)
    if entities:
        # Only collections that are not marked deleted get their entities
        # queued for indexing.
        if collection.deleted_at is None:
            index_collection_entities.delay(collection_id=collection.id)
    if refresh:
        refresh_collection(collection.id)
    index.index_collection(collection)
Beispiel #3
0
def bulk_load_query(collection_id, query):
    """Map the records of one query to entities and bulk-index them.

    Entities are buffered by ID and handed to ``index_bulk`` whenever the
    buffer holds at least ``BULK_PAGE`` entries; whatever is left over is
    flushed at the end, after which the collection itself is re-indexed.
    Returns early, with a warning, if the collection cannot be found.
    """
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    buffer = {}
    emitted = 0
    for record_no, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity_id = entity.get('id')
            if entity_id is not None:
                # Tabular sources frequently emit the same entity several
                # times in quick succession (e.g. a company repeated for
                # each of its directors), so fold each new emission into
                # whatever is already buffered under that ID.
                buffer[entity_id] = merge_data(entity,
                                               buffer.get(entity_id, {}))
                emitted += 1

        if record_no % 1000 == 0:
            log.info("[%s] Loaded %s records, %s entities...",
                     collection.foreign_id, record_no, emitted)

        if len(buffer) >= BULK_PAGE:
            index_bulk(collection, buffer, chunk_size=BULK_PAGE)
            buffer = {}

    if buffer:
        index_bulk(collection, buffer, chunk_size=BULK_PAGE)

    # Re-index the collection so its stats are updated.
    index_collection(collection)
Beispiel #4
0
def process_document(document):
    """Run the post-ingest steps for a document: analysis, then indexing.

    The owning collection is re-indexed as well when its ``casefile``
    attribute is truthy.
    """
    for step in (analyze_document, index_document, index_records):
        step(document)
    collection = document.collection
    if collection.casefile:
        index_collection(collection)
Beispiel #5
0
def bulk_load_query(collection_id, query):
    """Map the records of one query to entities and bulk-index them.

    Entities are buffered by ID and passed to ``index.index_bulk`` whenever
    the buffer holds at least ``BULK_PAGE`` entries. A final
    ``index.index_bulk`` call is always made with the remaining buffer, and
    the collection itself is re-indexed afterwards. Returns early, with a
    warning, if the collection cannot be found.
    """
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    # Falls back to the literal 'streaming' when the source length is falsy.
    records_total = len(mapping.source) or 'streaming'
    pending = {}
    added = 0
    for seen, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            # Tabular sources frequently emit the same entity several times
            # in quick succession (e.g. a company repeated for each of its
            # directors); merge repeats into the buffered instance.
            if entity.id not in pending:
                pending[entity.id] = entity
                added += 1
            else:
                pending[entity.id].merge(entity)

        # ``seen`` starts at 1, so this fires on every 1000th record.
        if seen % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, seen, records_total,
                     added)

        if len(pending) >= BULK_PAGE:
            index.index_bulk(collection.id, pending)
            pending = {}

    index.index_bulk(collection.id, pending)
    # Re-index the collection so its stats are updated.
    index_collection(collection)
Beispiel #6
0
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database.

    Rows of the source data are mapped to entities and links that the entity
    index understands. ``config`` maps a collection foreign ID to its
    settings; missing collections are created, the listed roles are granted
    access, and every configured query is loaded via ``bulk_load_query``.
    """
    for foreign_id, data in config.items():
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            attributes = {
                'foreign_id': foreign_id,
                'label': data.get('label') or foreign_id,
                'summary': data.get('summary'),
                'category': data.get('category'),
                'managed': True,
            }
            collection = Collection.create(attributes)

        for role_fk in dict_list(data, 'roles', 'role'):
            role = Role.by_foreign_id(role_fk)
            if role is None:
                log.warning("Could not find role: %s", role_fk)
                continue
            Permission.grant(collection, role, True, False)

        db.session.commit()
        index_collection(collection)

        for query in dict_list(data, 'queries', 'query'):
            bulk_load_query(collection, query)
Beispiel #7
0
def compute_collection(collection, sync=False):
    """Recompute and index a collection's statistics unless recently done.

    A cache marker keyed on the collection suppresses repeated work; passing
    a truthy ``sync`` bypasses the marker. ``sync`` is also forwarded to
    ``index.index_collection``.
    """
    stats_key = cache.object_key(Collection, collection.id, 'stats')
    marker = cache.get(stats_key)
    if sync or not marker:
        # Set the marker first, then do the actual computation.
        cache.set(stats_key, 'computed', expires=cache.EXPIRE - 60)
        log.info("Collection [%s] changed, computing...", collection.id)
        index.update_collection_stats(collection.id)
        index.index_collection(collection, sync=sync)
Beispiel #8
0
def update_collection(collection):
    """Index a collection, or remove it from the index if it is deleted.

    A collection counts as deleted when ``deleted_at`` is not ``None``.
    """
    if collection.deleted_at is None:
        log.info("Updating: %r", collection)
        index_collection(collection)
    else:
        index_delete(collection.id)
Beispiel #9
0
def compute_collection(collection, force=False, sync=False):
    """Refresh, recompute and index a collection's statistics.

    The work is skipped when a cache marker for the collection exists,
    unless ``force`` is truthy. ``sync`` is forwarded to
    ``index.index_collection``.
    """
    stats_key = cache.object_key(Collection, collection.id, "stats")
    marker = cache.get(stats_key)
    if force or marker is None:
        refresh_collection(collection.id)
        log.info("[%s] Computing statistics...", collection)
        index.update_collection_stats(collection.id)
        # The marker is written only after the stats update has run.
        cache.set(stats_key, "computed", expires=cache.EXPIRE)
        index.index_collection(collection, sync=sync)
Beispiel #10
0
def compute_collection(collection, force=False, sync=False):
    """Refresh, recompute and index a collection's statistics.

    The work is skipped when a cache entry for the collection exists, unless
    ``force`` is truthy. After the stats update, the cache entry is set to
    the ISO-formatted ``datetime.utcnow()`` value. ``sync`` is forwarded to
    ``index.index_collection``.
    """
    stats_key = cache.object_key(Collection, collection.id, "stats")
    cached = cache.get(stats_key)
    if cached is None or force:
        refresh_collection(collection.id)
        log.info("[%s] Computing statistics...", collection)
        index.update_collection_stats(collection.id)
        computed_at = datetime.utcnow().isoformat()
        cache.set(stats_key, computed_at)
        index.index_collection(collection, sync=sync)
Beispiel #11
0
def update_entity_full(entity_id):
    """Run the update steps for one entity: alert dedupe and re-indexing.

    Logs an error and returns if no entity with the given ID exists.
    Otherwise the entity and its collection are both indexed.
    """
    entity = db.session.query(Entity).filter(Entity.id == entity_id).first()
    if entity is None:
        log.error("No entity with ID: %r", entity_id)
        return
    Alert.dedupe(entity.id)
    index_entity(entity)
    index_collection(entity.collection)
Beispiel #12
0
def index_collections(entities=False, refresh=False):
    """Index all collections, most recently updated first.

    :param entities: when true, ``index_collection_entities`` is dispatched
        for every collection whose ``deleted_at`` is ``None``.
    :param refresh: when true, ``refresh_collection`` is called (with
        ``sync=False``) before each collection is indexed.
    """
    ordered = Collection.all(deleted=True).order_by(
        Collection.updated_at.desc())
    for coll in ordered:
        log.info("Index [%s]: %s", coll.id, coll.label)
        if entities:
            if coll.deleted_at is None:
                index_collection_entities.delay(collection_id=coll.id)
        if refresh:
            refresh_collection(coll.id, sync=False)
        index.index_collection(coll)
Beispiel #13
0
def reindex_entities(block=5000):
    """Index every entity of every collection, then the collection itself.

    :param block: value passed to ``yield_per`` for both the collection
        query and the per-collection entity query.
    """
    for collection in db.session.query(Collection).yield_per(block):
        log.info("Indexing entities in: %r", collection)
        members = db.session.query(Entity).filter(
            Entity.collection == collection)
        for entity in members.yield_per(block):
            # Attach the collection object we already hold to the entity.
            entity.collection = collection
            index_entity(entity)
        index_collection(collection)
Beispiel #14
0
def create_collection(data, role=None):
    """Create a collection from ``data``, commit it, index it and return it.

    :param data: passed to ``Collection.create``.
    :param role: acting role; falls back to ``Role.load_cli_user()`` when
        falsy.

    A ``CREATE_COLLECTION`` event is published only when the returned
    collection carries exactly the ``created_at`` timestamp generated here.
    """
    if not role:
        role = Role.load_cli_user()
    stamp = datetime.utcnow()
    collection = Collection.create(data, role=role, created_at=stamp)
    if collection.created_at == stamp:
        publish(Events.CREATE_COLLECTION,
                actor_id=role.id,
                params={'collection': collection})
    db.session.commit()
    index.index_collection(collection)
    return collection
Beispiel #15
0
def index_aggregate(queue, collection, sync=False):
    """Project the contents of the collection's aggregator into the index.

    Indexes the aggregated entities, refreshes the collection and then
    indexes the collection itself.

    :param queue: object whose ``remove()`` is called once the work has
        finished, whether it succeeded or raised.
    :param collection: the collection whose aggregator is projected.
    :param sync: forwarded to ``index_entities``, ``refresh_collection`` and
        ``index_collection``.
    """
    aggregator = get_aggregator(collection)
    try:
        index_entities(collection, aggregator, sync=sync)
        refresh_collection(collection.id, sync=sync)
        index_collection(collection, sync=sync)
        log.info("Aggregate indexed: %r", collection)
    finally:
        # Nest the cleanup so that an exception raised by
        # ``aggregator.close()`` cannot prevent ``queue.remove()`` from
        # running.
        try:
            aggregator.close()
        finally:
            queue.remove()
        queue.remove()
Beispiel #16
0
def update_collection(collection):
    """Persist and index a collection, or un-index it if it is deleted.

    A collection whose ``deleted_at`` is set is only passed to
    ``index_delete``. Otherwise ``updated_at`` is set to
    ``datetime.utcnow()``, the change is committed, the collection is indexed
    and ``flush_index`` is called.
    """
    if collection.deleted_at is not None:
        index_delete(collection.id)
        return

    collection.updated_at = datetime.utcnow()
    session = db.session
    session.add(collection)
    session.commit()

    log.info("Updating: %r", collection)
    index_collection(collection)
    flush_index()
Beispiel #17
0
def create_collection(foreign_id, data, role=None):
    """Fetch or create the collection with ``foreign_id``, then index it.

    :param foreign_id: foreign ID used for the lookup; written into ``data``
        when a new collection has to be created.
    :param data: attributes for ``Collection.create``; for an existing
        collection only a non-empty ``languages`` entry is applied.
    :param role: acting role; falls back to ``Role.load_cli_user()`` when
        falsy.
    :return: the created or existing collection.
    """
    if not role:
        role = Role.load_cli_user()
    collection = Collection.by_foreign_id(foreign_id)
    if collection is not None:
        # Existing collection: only its languages may be overwritten.
        languages = ensure_list(data.get('languages'))
        if len(languages) > 0:
            collection.languages = languages
    else:
        data['foreign_id'] = foreign_id
        collection = Collection.create(data, role=role)
    db.session.commit()
    index.index_collection(collection)
    return collection
Beispiel #18
0
def update_permission(role, collection, read, write):
    """Grant a role read/write access to a collection and notify the role.

    The permission that existed before the change and the one resulting from
    it are both passed to the notification template.

    :return: the value returned by ``Permission.grant``.
    """
    before = Permission.by_collection_role(collection, role)
    after = Permission.grant(collection, role, read, write)
    db.session.commit()
    update_roles(collection)
    index_collection(collection)

    collection_url = '%scollections/%s' % (app_url, collection.id)
    notify_role_template(role,
                         collection.label,
                         'email/permission.html',
                         url=collection_url,
                         pre=before,
                         post=after,
                         collection=collection)
    return after
Beispiel #19
0
def update(foreign_id=None, index=False, process=False, reset=False):
    """Re-index all the collections and entities.

    :param foreign_id: when given, only the collection returned by
        ``get_collection(foreign_id)`` is handled instead of all collections.
    :param index: when true, queue an ``OP_PROCESS`` task for each
        non-deleted collection.
    :param process: like ``index``; its value is also sent as the ``ingest``
        flag of the task payload.
    :param reset: when true, call ``reset_collection`` (with ``sync=True``)
        before refreshing each collection.
    """
    update_roles()
    targets = Collection.all(deleted=True)
    if foreign_id is not None:
        targets = [get_collection(foreign_id)]
    for collection in targets:
        if reset:
            reset_collection(collection, sync=True)
        refresh_collection(collection.id)
        index_collection(collection)
        # Deleted collections are indexed above but never queued.
        if collection.deleted_at is None and (index or process):
            queue_task(collection, OP_PROCESS, payload={'ingest': process})
Beispiel #20
0
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database.

    Rows of the source data are mapped to entities and links that the entity
    index understands. ``config`` maps a collection foreign ID to its
    settings; missing collections are created from those settings, and each
    configured query is dispatched to ``bulk_load_query`` with priority 6.
    """
    for foreign_id, data in config.items():
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            data['foreign_id'] = foreign_id
            # Use the foreign ID as the label when no 'label' key is present.
            data.setdefault('label', foreign_id)
            collection = Collection.create(data)

        db.session.commit()
        index_collection(collection)
        for query in dict_list(data, 'queries', 'query'):
            task_args = [collection.id, query]
            bulk_load_query.apply_async(task_args, priority=6)
Beispiel #21
0
def ingest(document_id, role_id=None):
    """Process a given document by extracting its contents.

    This may include creating or updating child documents. Logs an error and
    returns if the document does not exist. When the document's collection
    has a truthy ``casefile`` attribute, the collection is re-indexed and an
    ``INGEST_DOCUMENT`` event is published.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    get_manager().ingest_document(document, role_id=role_id)

    collection = document.collection
    if not collection.casefile:
        return

    index_collection(collection)
    publish(Events.INGEST_DOCUMENT,
            actor_id=role_id,
            params={'document': document, 'collection': collection})
Beispiel #22
0
def create_collection(data, role=None, sync=False):
    """Create a collection, publish its creation event and index it.

    :param data: passed to ``Collection.create``.
    :param role: creator role; falls back to ``Role.load_cli_user()`` when
        falsy.
    :param sync: forwarded to ``index.index_collection``.
    :return: the result of ``index.index_collection``.
    """
    if not role:
        role = Role.load_cli_user()
    now = datetime.utcnow()
    collection = Collection.create(data, creator=role, created_at=now)
    event_params = {'collection': collection}
    publish(Events.CREATE_COLLECTION,
            params=event_params,
            actor_id=role.id)
    db.session.commit()
    Authz.flush()
    refresh_collection(collection.id)
    indexed = index.index_collection(collection, sync=sync)
    return indexed
Beispiel #23
0
def create_collection(data, role=None, sync=False):
    """Create a collection, commit it and index it.

    :param data: passed to ``Collection.create``.
    :param role: acting role; falls back to ``Role.load_cli_user()`` when
        falsy.
    :param sync: forwarded to ``index.index_collection``.
    :return: the result of ``index.index_collection``.

    A ``CREATE_COLLECTION`` event is published only when the returned
    collection carries exactly the ``created_at`` timestamp generated here.
    """
    if not role:
        role = Role.load_cli_user()
    stamp = datetime.utcnow()
    collection = Collection.create(data, role=role, created_at=stamp)
    if collection.created_at == stamp:
        publish(Events.CREATE_COLLECTION,
                actor_id=role.id,
                params={'collection': collection})
    db.session.commit()
    Authz.flush()
    refresh_collection(collection.id)
    indexed = index.index_collection(collection, sync=sync)
    return indexed
Beispiel #24
0
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database.

    Rows of the source data are mapped to entities and links that the entity
    index understands. ``config`` maps a collection foreign ID to its
    settings; missing collections are created as managed collections, and
    each configured query is dispatched through ``bulk_load_query.delay``.
    """
    for foreign_id, data in config.items():
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            attributes = {
                'foreign_id': foreign_id,
                'label': data.get('label') or foreign_id,
                'summary': data.get('summary'),
                'category': data.get('category'),
                'managed': True,
            }
            collection = Collection.create(attributes)

        db.session.commit()
        index_collection(collection)

        queries = dict_list(data, 'queries', 'query')
        for query in queries:
            bulk_load_query.delay(collection.id, query)
Beispiel #25
0
def update_collection(collection):
    """Index a collection, or remove it from the index if it is deleted.

    For a collection that is not deleted, ``process_entities`` is dispatched
    and, when ``casefile`` is truthy, ``xref_collection`` is queued with
    priority 2.

    :return: the result of ``index.index_collection``, or ``None`` when the
        collection was deleted.
    """
    log.info("Updating: %r", collection)
    if collection.deleted_at is not None:
        index.delete_collection(collection.id)
        return

    # Queue re-processing of the collection's entities.
    process_entities.delay(collection_id=collection.id)

    if collection.casefile:
        xref_args = [collection.id]
        xref_collection.apply_async(xref_args, priority=2)
    indexed = index.index_collection(collection)
    return indexed
Beispiel #26
0
def update_collection(collection, roles=False):
    """Persist and index a collection, or un-index it if it is deleted.

    :param collection: the collection to update.
    :param roles: when true, ``update_roles`` is called for the collection
        after it has been indexed.

    For a collection that is not deleted, ``updated_at`` is set and
    committed, the collection is indexed, ``xref_collection`` is queued when
    the collection is not managed, and ``update_entity_full`` is queued for
    each of its entities before ``flush_index`` is called.
    """
    if collection.deleted_at is not None:
        index_delete(collection.id)
        return

    collection.updated_at = datetime.utcnow()
    db.session.add(collection)
    db.session.commit()

    log.info("Updating: %r", collection)
    index_collection(collection)
    if roles:
        update_roles(collection)

    if not collection.managed:
        xref_collection.apply_async([collection.id], priority=2)

    entity_ids = db.session.query(Entity.id).filter(
        Entity.collection_id == collection.id)
    for row in entity_ids.all():
        update_entity_full.apply_async([row.id], priority=2)

    flush_index()
Beispiel #27
0
def update_collection(collection):
    """Index a collection, or remove it from the index if it is deleted.

    For a collection that is not deleted, ``xref_collection`` is queued when
    ``casefile`` is truthy, and ``update_entity_full`` is queued (priority 1)
    for every entity ID of the collection.

    :return: the result of ``index_collection``, or ``None`` when the
        collection was deleted.
    """
    log.info("Updating: %r", collection)

    if collection.deleted_at is not None:
        index_delete(collection.id)
        return

    if collection.casefile:
        xref_collection.apply_async([collection.id], priority=2)
        # TODO: rebuild dossiers

    entity_ids = db.session.query(Entity.id).filter(
        Entity.collection_id == collection.id)
    for row in entity_ids:
        update_entity_full.apply_async([row.id], priority=1)

    return index_collection(collection)
Beispiel #28
0
def update_collection(collection, sync=False):
    """Flush authz state, refresh the collection and index it again.

    :param collection: the collection to update.
    :param sync: forwarded to ``index.index_collection``.
    :return: the result of ``index.index_collection``.
    """
    Authz.flush()
    refresh_collection(collection.id)
    indexed = index.index_collection(collection, sync=sync)
    return indexed
Beispiel #29
0
def update_collection(collection, sync=False):
    """Re-index a collection after flushing authz and refreshing it.

    :param collection: the collection to update.
    :param sync: forwarded to ``index.index_collection``.
    :return: the result of ``index.index_collection``.
    """
    Authz.flush()
    refresh_collection(collection.id)
    result = index.index_collection(collection, sync=sync)
    return result
Beispiel #30
0
def index_collections():
    """Log and index every collection returned by ``Collection.all``.

    ``Collection.all`` is called with ``deleted=True``.
    """
    every_collection = Collection.all(deleted=True)
    for coll in every_collection:
        log.info("Index [%s]: %s", coll.id, coll.label)
        index.index_collection(coll)
Beispiel #31
0
def index_collections():
    """Index every collection returned by ``Collection.all(deleted=True)``."""
    every_collection = Collection.all(deleted=True)
    for coll in every_collection:
        index.index_collection(coll)
Beispiel #32
0
def update_collection(collection, sync=False):
    """Flush authz, refresh the collection, then index it.

    :param collection: the collection to update.
    :param sync: forwarded to ``index.index_collection``.
    :return: the result of ``index.index_collection``.
    """
    Authz.flush()
    refresh_collection(collection.id)
    outcome = index.index_collection(collection, sync=sync)
    return outcome