コード例 #1
0
ファイル: collections.py プロジェクト: atom-cmd/eskom-enquiry
def index_collections():
    """Re-index every collection, visiting them by descending id.

    For each collection the existing index entry is deleted first and the
    collection is then indexed again.
    """
    query = db.session.query(Collection).order_by(Collection.id.desc())
    for item in query.all():
        log.info("Index [%s]: %s", item.foreign_id, item.label)
        index.delete_collection(item.id)
        index.index_collection(item)
コード例 #2
0
def index_collection(collection, entities=False, refresh=False):
    """Index a single collection.

    :param collection: the collection to index.
    :param entities: when true and the collection has no ``deleted_at``
        value, also queue ``index_collection_entities`` for it.
    :param refresh: when true, call ``refresh_collection`` before indexing.
    """
    log.info("Index [%s]: %s", collection.id, collection.label)
    queue_entities = entities and collection.deleted_at is None
    if queue_entities:
        index_collection_entities.delay(collection_id=collection.id)
    if refresh:
        refresh_collection(collection.id)
    index.index_collection(collection)
コード例 #3
0
ファイル: entities.py プロジェクト: SiloGit/aleph
def bulk_load_query(collection_id, query):
    """Map the records of ``query`` to entities and index them in pages.

    Entities without an ``id`` are skipped. Entities sharing an id within
    the current page are merged before being sent to ``index_bulk``. The
    collection itself is re-indexed once loading is finished. Nothing
    happens (beyond a warning) if the collection cannot be found.
    """
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    pending = {}
    emitted = 0
    for record_no, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity_id = entity.get('id')
            if entity_id is not None:
                # Tabular sources frequently emit the same entity several
                # times in a row (e.g. one row per director of a single
                # company), so fold repeats into the pending copy.
                previous = pending.get(entity_id, {})
                pending[entity_id] = merge_data(entity, previous)
                emitted += 1

        if record_no % 1000 == 0:
            log.info("[%s] Loaded %s records, %s entities...",
                     collection.foreign_id, record_no, emitted)

        # Flush a full page and start collecting the next one.
        if len(pending) >= BULK_PAGE:
            index_bulk(collection, pending, chunk_size=BULK_PAGE)
            pending = {}

    # Flush whatever is left over from the last, partial page.
    if pending:
        index_bulk(collection, pending, chunk_size=BULK_PAGE)

    # Update collection stats
    index_collection(collection)
コード例 #4
0
def process_document(document):
    """Run the post-ingest steps for a document.

    The document is analyzed, then the document and its records are
    indexed. If its collection is flagged as a casefile, the collection
    is indexed as well.
    """
    analyze_document(document)
    index_document(document)
    index_records(document)
    collection = document.collection
    if collection.casefile:
        index_collection(collection)
コード例 #5
0
def bulk_load_query(collection_id, query):
    """Map the records of ``query`` to entities and bulk-index them.

    Entities with the same id inside the current page are merged into the
    first one seen. Pages are flushed via ``index.index_bulk`` whenever
    they reach ``BULK_PAGE`` entries, and once more after the last record.
    The collection is re-indexed at the end. Nothing happens (beyond a
    warning) if the collection cannot be found.
    """
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    # A falsy length is reported as 'streaming' in the progress log.
    records_total = len(mapping.source) or 'streaming'
    page = {}
    distinct = 0
    for position, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            # Tabular sources frequently emit the same entity several
            # times in a row (e.g. one row per director of a single
            # company), so repeats are merged into the stored entity.
            existing = page.get(entity.id)
            if existing is None:
                page[entity.id] = entity
                distinct += 1
            else:
                existing.merge(entity)

        # ``position`` starts at 1, so this fires on every 1000th record.
        if position % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, position, records_total,
                     distinct)

        if len(page) >= BULK_PAGE:
            index.index_bulk(collection.id, page)
            page = {}

    index.index_bulk(collection.id, page)
    # Update collection stats
    index_collection(collection)
コード例 #6
0
ファイル: entities.py プロジェクト: Ro9ueAdmin/aleph
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database.

    ``config`` maps a collection foreign id to its settings. For each
    entry the collection is looked up (and created as a managed collection
    if missing), the listed roles are granted read access, the collection
    is indexed, and every configured query is passed to
    ``bulk_load_query``.
    """
    for foreign_id, data in config.items():
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            attributes = {
                'foreign_id': foreign_id,
                'label': data.get('label') or foreign_id,
                'summary': data.get('summary'),
                'category': data.get('category'),
                'managed': True,
            }
            collection = Collection.create(attributes)

        for role_fk in dict_list(data, 'roles', 'role'):
            role = Role.by_foreign_id(role_fk)
            if role is None:
                log.warning("Could not find role: %s", role_fk)
                continue
            Permission.grant(collection, role, True, False)

        db.session.commit()
        index_collection(collection)

        queries = dict_list(data, 'queries', 'query')
        for query in queries:
            bulk_load_query(collection, query)
コード例 #7
0
def compute_collection(collection, sync=False):
    """Recompute and index collection statistics unless recently done.

    A truthy cache entry under the collection's 'stats' key short-circuits
    the call, except when ``sync`` is set. The cache marker is written
    before the statistics are updated.
    """
    key = cache.object_key(Collection, collection.id, 'stats')
    already_computed = cache.get(key)
    if already_computed and not sync:
        return
    cache.set(key, 'computed', expires=cache.EXPIRE - 60)
    log.info("Collection [%s] changed, computing...", collection.id)
    collection_id = collection.id
    index.update_collection_stats(collection_id)
    index.index_collection(collection, sync=sync)
コード例 #8
0
def update_collection(collection):
    """Index a live collection, or drop a deleted one from the index.

    A collection with a ``deleted_at`` value is removed via
    ``index_delete``; any other collection is logged and indexed.
    """
    if collection.deleted_at is None:
        log.info("Updating: %r", collection)
        index_collection(collection)
    else:
        index_delete(collection.id)
コード例 #9
0
ファイル: collections.py プロジェクト: wayne9qiu/aleph
def compute_collection(collection, force=False, sync=False):
    """Refresh, recompute statistics for, and index a collection.

    The work is skipped when the collection's 'stats' cache key holds a
    value, unless ``force`` is set. The cache marker is written after the
    statistics update and before the collection is indexed.
    """
    key = cache.object_key(Collection, collection.id, "stats")
    cached = cache.get(key)
    if cached is not None and not force:
        return
    collection_id = collection.id
    refresh_collection(collection_id)
    log.info("[%s] Computing statistics...", collection)
    index.update_collection_stats(collection.id)
    cache.set(key, "computed", expires=cache.EXPIRE)
    index.index_collection(collection, sync=sync)
コード例 #10
0
def compute_collection(collection, force=False, sync=False):
    """Refresh, recompute statistics for, and index a collection.

    The work is skipped when the collection's 'stats' cache key holds a
    value, unless ``force`` is set. After the statistics update, the
    current UTC time (ISO formatted) is stored under that key.
    """
    key = cache.object_key(Collection, collection.id, "stats")
    cached = cache.get(key)
    if cached is not None and not force:
        return
    collection_id = collection.id
    refresh_collection(collection_id)
    log.info("[%s] Computing statistics...", collection)
    index.update_collection_stats(collection.id)
    computed_at = datetime.utcnow().isoformat()
    cache.set(key, computed_at)
    index.index_collection(collection, sync=sync)
コード例 #11
0
def update_entity_full(entity_id):
    """Dedupe alerts for an entity, then index it and its collection.

    Logs an error and returns without doing anything else when no entity
    with the given id exists.
    """
    entity = (
        db.session.query(Entity)
        .filter(Entity.id == entity_id)
        .first()
    )
    if entity is None:
        log.error("No entity with ID: %r", entity_id)
        return
    Alert.dedupe(entity.id)
    index_entity(entity)
    index_collection(entity.collection)
コード例 #12
0
def index_collections(entities=False, refresh=False):
    """Index all collections (deleted ones included), newest update first.

    :param entities: when true, queue ``index_collection_entities`` for
        each collection that has no ``deleted_at`` value.
    :param refresh: when true, call ``refresh_collection`` with
        ``sync=False`` before indexing each collection.
    """
    ordered = Collection.all(deleted=True).order_by(
        Collection.updated_at.desc()
    )
    for collection in ordered:
        log.info("Index [%s]: %s", collection.id, collection.label)
        queue_entities = entities and collection.deleted_at is None
        if queue_entities:
            index_collection_entities.delay(collection_id=collection.id)
        if refresh:
            refresh_collection(collection.id, sync=False)
        index.index_collection(collection)
コード例 #13
0
ファイル: entities.py プロジェクト: kaue-cauin/aleph
def reindex_entities(block=5000):
    """Index every entity of every collection, then each collection.

    :param block: batch size handed to ``yield_per`` for both the
        collection query and the per-collection entity query.
    """
    collections = db.session.query(Collection).yield_per(block)
    for collection in collections:
        log.info("Indexing entities in: %r", collection)
        members = db.session.query(Entity).filter(
            Entity.collection == collection
        )
        for entity in members.yield_per(block):
            # Attach the collection object we already hold to the entity
            # before it is indexed.
            entity.collection = collection
            index_entity(entity)
        index_collection(collection)
コード例 #14
0
def create_collection(data, role=None):
    """Create a collection from ``data``, index it and return it.

    Falls back to ``Role.load_cli_user()`` when no role is given. The
    CREATE_COLLECTION event is published only if the returned collection
    carries the creation timestamp generated by this call.
    """
    if not role:
        role = Role.load_cli_user()
    timestamp = datetime.utcnow()
    collection = Collection.create(data, role=role, created_at=timestamp)
    is_new = collection.created_at == timestamp
    if is_new:
        publish(Events.CREATE_COLLECTION,
                actor_id=role.id,
                params={'collection': collection})
    db.session.commit()
    index.index_collection(collection)
    return collection
コード例 #15
0
def index_aggregate(queue, collection, sync=False):
    """Index the entities held by the collection's aggregator.

    After the entities are indexed the collection is refreshed and
    indexed. The aggregator is closed and ``queue.remove()`` is called
    whether or not indexing succeeds.
    """
    aggregator = get_aggregator(collection)
    try:
        index_entities(collection, aggregator, sync=sync)
        collection_id = collection.id
        refresh_collection(collection_id, sync=sync)
        index_collection(collection, sync=sync)
        log.info("Aggregate indexed: %r", collection)
    finally:
        # Cleanup order matters: close the aggregator first, then remove
        # the queue entry.
        aggregator.close()
        queue.remove()
        queue.remove()
コード例 #16
0
def update_collection(collection):
    """Persist and index a collection, or un-index it when deleted.

    A collection with a ``deleted_at`` value is only removed via
    ``index_delete``. Otherwise its ``updated_at`` is set to the current
    UTC time, the change is committed, the collection is indexed and the
    index is flushed.
    """
    if collection.deleted_at is not None:
        index_delete(collection.id)
        return

    collection.updated_at = datetime.utcnow()
    session = db.session
    session.add(collection)
    session.commit()

    log.info("Updating: %r", collection)
    index_collection(collection)
    flush_index()
コード例 #17
0
ファイル: collections.py プロジェクト: atom-cmd/eskom-enquiry
def create_collection(foreign_id, data, role=None):
    """Fetch or create the collection with ``foreign_id`` and index it.

    When the collection does not exist yet, ``data`` receives the
    ``foreign_id`` key and is used to create it. When it already exists,
    only its languages are replaced, and only if ``data`` supplies a
    non-empty list of them. Returns the collection.
    """
    role = role or Role.load_cli_user()
    collection = Collection.by_foreign_id(foreign_id)
    if collection is not None:
        languages = ensure_list(data.get('languages'))
        if len(languages) > 0:
            collection.languages = languages
    else:
        data['foreign_id'] = foreign_id
        collection = Collection.create(data, role=role)
    db.session.commit()
    index.index_collection(collection)
    return collection
コード例 #18
0
ファイル: permissions.py プロジェクト: kaue-cauin/aleph
def update_permission(role, collection, read, write):
    """Grant ``role`` the given read/write access on ``collection``.

    After the grant is committed, roles are updated and the collection is
    indexed. The role is then notified using the permission template,
    which receives both the permission found before the change and the
    newly granted one. Returns the newly granted permission.
    """
    pre = Permission.by_collection_role(collection, role)
    post = Permission.grant(collection, role, read, write)
    db.session.commit()
    update_roles(collection)
    index_collection(collection)

    template_context = {
        'url': '%scollections/%s' % (app_url, collection.id),
        'pre': pre,
        'post': post,
        'collection': collection,
    }
    notify_role_template(role,
                         collection.label,
                         'email/permission.html',
                         **template_context)
    return post
コード例 #19
0
ファイル: manage.py プロジェクト: wdsn/aleph
def update(foreign_id=None, index=False, process=False, reset=False):
    """Re-index all the collections and entities.

    :param foreign_id: restrict the run to this one collection.
    :param index: queue an ``OP_PROCESS`` task for each live collection.
    :param process: as ``index``, and also set the task's ``ingest`` flag.
    :param reset: call ``reset_collection`` (synchronously) first.
    """
    update_roles()
    collections = Collection.all(deleted=True)
    if foreign_id is not None:
        collections = [get_collection(foreign_id)]
    wants_task = index or process
    for collection in collections:
        if reset:
            reset_collection(collection, sync=True)
        refresh_collection(collection.id)
        index_collection(collection)
        # Deleted collections are indexed above but never queued.
        if collection.deleted_at is None and wants_task:
            queue_task(collection, OP_PROCESS, payload={'ingest': process})
コード例 #20
0
ファイル: entities.py プロジェクト: renesugar/aleph
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database.

    ``config`` maps a collection foreign id to its settings. A missing
    collection is created from those settings (which are updated in place
    with ``foreign_id`` and a default ``label``). The collection is then
    indexed and each configured query is dispatched to the
    ``bulk_load_query`` task with priority 6.
    """
    for foreign_id, data in config.items():
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            data['foreign_id'] = foreign_id
            label = data.get('label', foreign_id)
            data['label'] = label
            collection = Collection.create(data)

        db.session.commit()
        index_collection(collection)
        for query in dict_list(data, 'queries', 'query'):
            task_args = [collection.id, query]
            bulk_load_query.apply_async(task_args, priority=6)
コード例 #21
0
def ingest(document_id, role_id=None):
    """Process a given document by extracting its contents.

    This may include creating or updating child documents. When the
    document's collection is a casefile, the collection is indexed and an
    INGEST_DOCUMENT event is published. A missing document is logged as
    an error and nothing else happens.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    manager = get_manager()
    manager.ingest_document(document, role_id=role_id)

    if not document.collection.casefile:
        return

    index_collection(document.collection)
    event_params = {
        'document': document,
        'collection': document.collection
    }
    publish(Events.INGEST_DOCUMENT,
            actor_id=role_id,
            params=event_params)
コード例 #22
0
def create_collection(data, role=None, sync=False):
    """Create a collection from ``data`` and index it.

    Falls back to ``Role.load_cli_user()`` when no role is given. A
    CREATE_COLLECTION event is published before the commit. Returns the
    result of ``index.index_collection``.
    """
    if not role:
        role = Role.load_cli_user()
    timestamp = datetime.utcnow()
    collection = Collection.create(data, creator=role, created_at=timestamp)
    publish(Events.CREATE_COLLECTION,
            params={'collection': collection},
            actor_id=role.id)
    db.session.commit()
    Authz.flush()
    refresh_collection(collection.id)
    indexed = index.index_collection(collection, sync=sync)
    return indexed
コード例 #23
0
ファイル: collections.py プロジェクト: pudo/aleph
def create_collection(data, role=None, sync=False):
    """Create a collection from ``data`` and index it.

    Falls back to ``Role.load_cli_user()`` when no role is given. The
    CREATE_COLLECTION event is published only if the returned collection
    carries the creation timestamp generated by this call. Returns the
    result of ``index.index_collection``.
    """
    if not role:
        role = Role.load_cli_user()
    timestamp = datetime.utcnow()
    collection = Collection.create(data, role=role, created_at=timestamp)
    is_new = collection.created_at == timestamp
    if is_new:
        publish(Events.CREATE_COLLECTION,
                actor_id=role.id,
                params={'collection': collection})
    db.session.commit()
    Authz.flush()
    refresh_collection(collection.id)
    indexed = index.index_collection(collection, sync=sync)
    return indexed
コード例 #24
0
ファイル: entities.py プロジェクト: SiloGit/aleph
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database.

    ``config`` maps a collection foreign id to its settings. A missing
    collection is created as a managed collection. The collection is
    indexed and each configured query is handed to the
    ``bulk_load_query`` task.
    """
    for foreign_id, data in config.items():
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            attributes = {
                'foreign_id': foreign_id,
                'label': data.get('label') or foreign_id,
                'summary': data.get('summary'),
                'category': data.get('category'),
                'managed': True,
            }
            collection = Collection.create(attributes)

        db.session.commit()
        index_collection(collection)

        queries = dict_list(data, 'queries', 'query')
        for query in queries:
            bulk_load_query.delay(collection.id, query)
コード例 #25
0
ファイル: collections.py プロジェクト: atom-cmd/eskom-enquiry
def update_collection(collection):
    """Re-process and index a collection, or un-index it when deleted.

    A collection with a ``deleted_at`` value is removed from the index and
    ``None`` is returned. Otherwise entity processing is queued,
    cross-referencing is queued for casefiles, and the result of
    ``index.index_collection`` is returned.
    """
    log.info("Updating: %r", collection)
    if collection.deleted_at is not None:
        index.delete_collection(collection.id)
        return

    collection_id = collection.id
    # re-process entities
    process_entities.delay(collection_id=collection_id)

    if collection.casefile:
        xref_collection.apply_async([collection_id], priority=2)
    indexed = index.index_collection(collection)
    return indexed
コード例 #26
0
ファイル: collections.py プロジェクト: jigsawsecurity/aleph
def update_collection(collection, roles=False):
    """Persist, index and fan out updates for a collection.

    A collection with a ``deleted_at`` value is only removed via
    ``index_delete``. Otherwise its ``updated_at`` is set to the current
    UTC time and committed, the collection is indexed, roles are updated
    when ``roles`` is true, cross-referencing is queued for unmanaged
    collections, and a full update is queued for each of its entities
    before the index is flushed.
    """
    if collection.deleted_at is not None:
        index_delete(collection.id)
        return

    collection.updated_at = datetime.utcnow()
    session = db.session
    session.add(collection)
    session.commit()

    log.info("Updating: %r", collection)
    index_collection(collection)
    if roles:
        update_roles(collection)

    if not collection.managed:
        xref_collection.apply_async([collection.id], priority=2)

    entity_ids = db.session.query(Entity.id).filter(
        Entity.collection_id == collection.id
    )
    for row in entity_ids.all():
        update_entity_full.apply_async([row.id], priority=2)

    flush_index()
コード例 #27
0
def update_collection(collection):
    """Queue entity updates for a collection and index it.

    A collection with a ``deleted_at`` value is removed via
    ``index_delete`` and ``None`` is returned. Otherwise
    cross-referencing is queued for casefiles, a full update is queued
    for each entity, and the result of ``index_collection`` is returned.
    """
    log.info("Updating: %r", collection)

    if collection.deleted_at is not None:
        index_delete(collection.id)
        return

    if collection.casefile:
        xref_collection.apply_async([collection.id], priority=2)
        # TODO: rebuild dossiers

    entity_ids = db.session.query(Entity.id).filter(
        Entity.collection_id == collection.id
    )
    for row in entity_ids:
        update_entity_full.apply_async([row.id], priority=1)

    indexed = index_collection(collection)
    return indexed
コード例 #28
0
def update_collection(collection, sync=False):
    """Refresh a collection and re-index it.

    Calls ``Authz.flush()`` and ``refresh_collection`` first, then returns
    the result of ``index.index_collection``.
    """
    Authz.flush()
    collection_id = collection.id
    refresh_collection(collection_id)
    indexed = index.index_collection(collection, sync=sync)
    return indexed
コード例 #29
0
ファイル: collections.py プロジェクト: pudo/aleph
def update_collection(collection, sync=False):
    """Refresh a collection and re-index it.

    Calls ``Authz.flush()`` and ``refresh_collection`` first, then returns
    the result of ``index.index_collection``.
    """
    Authz.flush()
    collection_id = collection.id
    refresh_collection(collection_id)
    indexed = index.index_collection(collection, sync=sync)
    return indexed
コード例 #30
0
ファイル: collections.py プロジェクト: jbaehne/aleph
def index_collections():
    """Log and index every collection, deleted ones included."""
    every_collection = Collection.all(deleted=True)
    for item in every_collection:
        log.info("Index [%s]: %s", item.id, item.label)
        index.index_collection(item)
コード例 #31
0
ファイル: collections.py プロジェクト: jorgeguilherme/aleph
def index_collections():
    """Index every collection, deleted ones included."""
    every_collection = Collection.all(deleted=True)
    for item in every_collection:
        index.index_collection(item)
コード例 #32
0
ファイル: collections.py プロジェクト: jorgeguilherme/aleph
def update_collection(collection, sync=False):
    """Refresh a collection and re-index it.

    Calls ``Authz.flush()`` and ``refresh_collection`` first, then returns
    the result of ``index.index_collection``.
    """
    Authz.flush()
    collection_id = collection.id
    refresh_collection(collection_id)
    indexed = index.index_collection(collection, sync=sync)
    return indexed