Code Example #1
File: entities.py Project: GelLiNN/aleph
def index_entity(entity):
    """Index an entity."""
    if entity.deleted_at is not None:
        return delete_entity(entity.id)

    data = {
        'foreign_ids': entity.foreign_ids,
        'data': entity.data,
        'created_at': entity.created_at,
        'updated_at': entity.updated_at,
        'bulk': False,
        'roles': entity.collection.roles,
        'collection_id': entity.collection_id,
        'properties': {
            'name': [entity.name]
        }
    }

    for k, v in entity.data.items():
        data['properties'][k] = ensure_list(v)

    # data['$documents'] = get_count(entity)
    data = finalize_index(data, entity.schema)
    es.index(index=entity_index(),
             doc_type=entity_type(),
             id=entity.id,
             body=data)
    data['id'] = entity.id
    return data
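
The example above leans on ensure_list() to normalise property values into
lists. In aleph that helper comes from the banal package; the sketch below
is only an illustrative re-implementation of its behaviour, not the
project's own code.

def ensure_list(value):
    # Normalise any value to a list: None becomes an empty list,
    # sequences are converted, scalars are wrapped.
    if value is None:
        return []
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return [value]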
Code Example #2
File: entities.py Project: arezola/aleph
def _index_updates(collection, entities):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index accumulative: if an entity or link
    gets indexed twice with different field values, the differing values
    are merged into a single record instead of overwriting the document
    and losing data. An alternative solution would be to implement this
    in Groovy on the ES side.
    """
    common = {
        'collection_id': collection.id,
        'bulk': True,
        'roles': collection.roles,
        'updated_at': datetime.utcnow()
    }
    if not len(entities):
        return

    query = {
        'query': {
            'ids': {
                'values': list(entities.keys())
            }
        },
        '_source': ['schema', 'properties', 'created_at']
    }
    result = search_safe(index=entity_index(), body=query)
    for doc in result.get('hits').get('hits', []):
        entity_id = doc['_id']
        entity = entities.get(entity_id)
        existing = doc.get('_source')
        combined = model.merge(existing, entity)
        combined['created_at'] = existing.get('created_at')
        entities[entity_id] = combined

    for doc_id, entity in entities.items():
        entity.pop('id', None)
        entity.update(common)
        schema = model.get(entity.get('schema'))
        entity = finalize_index(entity, schema, [])
        # pprint(entity)
        yield {
            '_id': doc_id,
            '_index': entity_index(),
            '_type': 'doc',
            '_source': entity
        }
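
_index_updates() yields action dicts shaped for the bulk helpers of the
official elasticsearch Python client. A minimal sketch of how a caller
might consume the generator (this call site is an assumption, it is not
shown in the source):

from elasticsearch.helpers import bulk

def index_bulk(collection, entities):
    # Stream the prepared actions into Elasticsearch in batches.
    actions = _index_updates(collection, entities)
    bulk(es, actions, chunk_size=500, stats_only=True)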
Code Example #3
File: entities.py Project: mustafaascha/aleph
def index_single(obj, proxy, data, texts, sync=False):
    """Indexing aspects common to entities and documents."""
    data = finalize_index(proxy, data, texts)
    data['bulk'] = False
    data['collection_id'] = obj.collection.id
    data['created_at'] = obj.created_at
    data['updated_at'] = obj.updated_at
    # pprint(data)
    refresh = 'wait_for' if sync else False
    return index_safe(entity_index(), obj.id, data, refresh=refresh)
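
Unlike examples #1 and #6, this version routes the write through an
index_safe() wrapper instead of calling es.index() directly. A hypothetical
sketch of what such a wrapper might look like (aleph's real helper lives in
its index utilities and may differ):

def index_safe(index, id, body, **kwargs):
    # Index a single document and return the body with its id mixed in.
    es.index(index=index, doc_type='doc', id=str(id), body=body, **kwargs)
    body['id'] = str(id)
    return body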
Code Example #4
def index_single(obj, data, texts):
    """Indexing aspects common to entities and documents."""
    data['bulk'] = False
    data['roles'] = obj.collection.roles
    data['collection_id'] = obj.collection.id
    data['created_at'] = obj.created_at
    data['updated_at'] = obj.updated_at
    data = finalize_index(data, obj.model, texts)
    data = clean_dict(data)
    return index_safe(entity_index(), obj.id, data)
Code Example #5
File: ingest_api.py Project: dkhurshudian/aleph
def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, path)
            documents.append(document)

        if not len(request.files):
            # If no files were uploaded, create an empty directory
            # (folder) instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it into
            # the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.schema = Document.SCHEMA_FOLDER
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, None)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    if collection.casefile:
        for document in documents:
            params = {'document': document, 'collection': collection}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    refresh_index(index=entity_index())
    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })
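
The upload loop derives a content_hash for each stored file via a
checksum() helper. A self-contained sketch of such a helper (SHA1 matches
aleph's convention, but this implementation is an illustration, not the
project's code):

import hashlib

def checksum(path):
    # Stream the file in chunks and return a hex digest of its content.
    digest = hashlib.sha1()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest()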
Code Example #6
File: entities.py Project: public-people/aleph
def index_single(obj, data, texts):
    """Indexing aspects common to entities and documents."""
    data['bulk'] = False
    data['roles'] = obj.collection.roles
    data['collection_id'] = obj.collection.id
    data['created_at'] = obj.created_at
    data['updated_at'] = obj.updated_at
    data = finalize_index(data, obj.model, texts)
    data = clean_dict(data)
    es.index(index=entity_index(), doc_type='doc', id=str(obj.id), body=data)
    data['id'] = str(obj.id)
    return data
Code Example #7
def update_roles(collection):
    """Update the role visibility of objects which are part of collections."""
    roles = ', '.join([str(r) for r in collection.roles])
    body = {
        'query': {'term': {'collection_id': collection.id}},
        'script': {
            'inline': 'ctx._source.roles = [%s]' % roles
        }
    }
    es.update_by_query(index=entity_index(),
                       body=body,
                       wait_for_completion=False)
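
This version interpolates the role IDs into the script source by string
formatting. Elasticsearch also accepts script parameters, which avoids
building script code from data; a sketch of a parameterised variant (an
assumption, not taken from the source):

def update_roles_params(collection):
    # Same effect as update_roles(), but the role list travels through
    # script params instead of being formatted into the script string.
    body = {
        'query': {'term': {'collection_id': collection.id}},
        'script': {
            'inline': 'ctx._source.roles = params.roles',
            'params': {'roles': list(collection.roles)}
        }
    }
    es.update_by_query(index=entity_index(), body=body,
                       wait_for_completion=False)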
Code Example #8
def upgrade_search():
    """Add any missing properties to the index mappings."""
    INDEXES = [
        (collection_index(), COLLECTION_MAPPING),
        (entity_index(), ENTITY_MAPPING),
        (record_index(), RECORD_MAPPING),
    ]
    for (index, mapping) in INDEXES:
        log.info("Creating index: %s", index)
        es.indices.create(index, ignore=[404, 400])
        es.indices.put_mapping(index=index, doc_type='doc', body=mapping)
        es.indices.open(index=index, ignore=[400, 404])
        es.indices.refresh(index=index)
Code Example #9
File: stats.py Project: GelLiNN/aleph
def get_collection_stats(collection_id):
    """Compute some statistics on the content of a collection."""
    query = {
        'size': 0,
        'query': {
            'term': {
                'collection_id': collection_id
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            },
            'countries': {
                'terms': {
                    'field': 'countries',
                    'size': 250
                }
            },
            'languages': {
                'terms': {
                    'field': 'languages',
                    'size': 100
                }
            },
        }
    }
    result = es.search(index=entity_index(),
                       doc_type=entity_type(),
                       body=query)
    aggregations = result.get('aggregations')
    data = {'schemata': {}, 'count': result['hits']['total']}

    # expose entities by schema count.
    for schema in aggregations['schema']['buckets']:
        data['schemata'][schema['key']] = schema['doc_count']

    # If no countries or languages are given, take the most common
    # values from the aggregation buckets.
    if not len(data.get('countries', [])):
        countries = aggregations['countries']['buckets']
        data['countries'] = [c['key'] for c in countries]

    if not len(data.get('languages', [])):
        languages = aggregations['languages']['buckets']
        data['languages'] = [lang['key'] for lang in languages]

    # pprint(data)
    return data
Code Example #10
def update_collection_roles(collection):
    """Update the role visibility of objects which are part of collections."""
    roles = ', '.join([str(r) for r in collection.roles])
    body = {
        'query': {
            'term': {
                'collection_id': collection.id
            }
        },
        'script': {
            'inline': 'ctx._source.roles = [%s]' % roles
        }
    }
    query_update(entity_index(), body)
Code Example #11
File: util.py Project: jigsawsecurity/aleph
    def setUp(self):
        if not hasattr(TestCase, '_global_test_state'):
            TestCase._global_test_state = True
            delete_index()
            upgrade_search()
        else:
            indexes = [collection_index(), entity_index(), record_index()]
            es.delete_by_query(index=indexes,
                               body={'query': {'match_all': {}}},
                               refresh=True,
                               conflicts='proceed')

        destroy_db()
        db.create_all()
        create_system_roles()
Code Example #12
File: admin.py Project: atom-cmd/eskom-enquiry
def upgrade_search():
    """Add any missing properties to the index mappings."""
    INDEXES = [
        (collections_index(), COLLECTION_MAPPING),
        (entity_index(), ENTITY_MAPPING),
        (record_index(), RECORD_MAPPING),
    ]
    for (index, mapping) in INDEXES:
        log.info("Creating index: %s", index)
        settings = deepcopy(INDEX_SETTINGS)
        if index == record_index():
            # optimise records for bulk write
            settings['index']['refresh_interval'] = '-1'
        es.indices.create(index, body=settings, ignore=[404, 400])
        es.indices.put_mapping(index=index, doc_type='doc', body=mapping)
        es.indices.open(index=index, ignore=[400, 404])
        es.indices.refresh(index=index, ignore=[400, 404])
        es.indices.clear_cache(index=index, ignore=[400, 404])
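
Setting refresh_interval to -1 speeds up bulk writes, but it also stops
new records from becoming searchable. A sketch of how the setting might
be restored once a bulk load has finished (the interval value is an
assumption; 1s is the Elasticsearch default):

es.indices.put_settings(index=record_index(),
                        body={'index': {'refresh_interval': '1s'}})
es.indices.refresh(index=record_index())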
Code Example #13
File: entities.py Project: mustafaascha/aleph
def _index_updates(collection, entities):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index accumulative: if an entity or link
    gets indexed twice with different field values, the differing values
    are merged into a single record instead of overwriting the document
    and losing data. An alternative solution would be to implement this
    in Groovy on the ES side.
    """
    common = {
        'collection_id': collection.id,
        'updated_at': datetime.utcnow(),
        'bulk': True
    }
    timestamps = {}
    if not len(entities):
        return []

    for result in iter_entities_by_ids(list(entities.keys())):
        if int(result.get('collection_id')) != collection.id:
            raise RuntimeError("Key collision between collections.")
        existing = model.get_proxy(result)
        entities[existing.id].merge(existing)
        timestamps[existing.id] = result.get('created_at')

    actions = []
    for entity_id, entity in entities.items():
        context = dict(common)
        context['created_at'] = timestamps.get(entity.id)
        entity = finalize_index(entity, context, [])
        # pprint(entity)
        actions.append({
            '_id': entity_id,
            '_index': entity_index(),
            '_type': 'doc',
            '_source': entity
        })
    return actions
Code Example #14
File: entities.py Project: mustafaascha/aleph
def iter_entities_by_ids(ids, authz=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    for i in range(0, len(ids), MAX_PAGE):
        chunk = ids[i:i + MAX_PAGE]
        if not len(chunk):
            return
        query = bool_query()
        query['bool']['filter'].append({'ids': {'values': chunk}})
        if authz is not None:
            query['bool']['filter'].append(authz_query(authz))
        includes = ['schema', 'properties', 'collection_id', 'created_at']
        query = {
            'query': query,
            '_source': {'includes': includes},
            'size': min(MAX_PAGE, len(chunk) * 2)
        }
        result = search_safe(index=entity_index(),
                             body=query,
                             request_cache=False)
        for doc in result.get('hits', {}).get('hits', []):
            entity = unpack_result(doc)
            if entity is not None:
                yield entity
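
The query here is assembled on top of a bool_query() helper. In aleph this
returns an empty Elasticsearch boolean query scaffold, roughly like the
sketch below (the exact set of clause lists is an assumption):

def bool_query():
    # Skeleton boolean query; callers append clauses to these lists
    # before executing the search.
    return {
        'bool': {
            'must': [],
            'must_not': [],
            'should': [],
            'filter': []
        }
    }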
Code Example #15
def all_indexes():
    return [collection_index(), entity_index(), record_index()]
Code Example #16
def clear_index():
    indexes = [collection_index(), entity_index(), record_index()]
    q = {'query': {'match_all': {}}}
    es.delete_by_query(index=indexes, body=q, refresh=True)
Code Example #17
File: documents.py Project: GelLiNN/aleph
def index_document(document):
    if document.status == Document.STATUS_PENDING:
        return

    log.info("Index document [%s]: %s", document.id, document.title)
    schema = model.get(Document.SCHEMA)
    data = {
        'schema': schema.name,
        'schemata': schema.names,
        'collection_id': document.collection_id,
        'roles': document.collection.roles,
        'type': document.type,
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'created_at': document.created_at,
        'updated_at': document.updated_at,
        'title': document.title,
        'name': document.title,
        'summary': document.summary,
        'author': document.author,
        'file_size': document.file_size,
        'file_name': document.file_title,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        'children': document.children.count(),
        'text': index_form(document.texts)
    }
    if document.parent_id is not None:
        data['parent'] = {
            'id': document.parent_id,
            'type': document.parent.type,
            'title': document.parent.title,
        }

    q = db.session.query(DocumentTag)
    q = q.filter(DocumentTag.document_id == document.id)
    for tag in q.yield_per(5000):
        field = TAG_FIELDS.get(tag.type)
        if field is None:
            log.warning("Cannot index document tag: %r", tag)
            continue
        if field not in data:
            data[field] = []
        data[field].append(tag.text)

    index_names(data)
    data = clean_dict(data)
    # pprint(data)
    es.index(index=entity_index(),
             doc_type=entity_type(),
             body=data,
             id=document.id)
    data['id'] = document.id
    return data
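
The tag loop depends on a TAG_FIELDS lookup that maps DocumentTag types to
index fields. A purely illustrative sketch of its shape (the actual type
constants and field names in aleph may differ):

TAG_FIELDS = {
    DocumentTag.TYPE_PERSON: 'people',
    DocumentTag.TYPE_EMAIL: 'emails',
    DocumentTag.TYPE_PHONE: 'phones',
}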
Code Example #18
def flush_index():
    """Run a refresh to apply all indexing changes."""
    es.indices.refresh(index=collection_index())
    es.indices.refresh(index=entity_index())
    es.indices.refresh(index=record_index())
Code Example #19
def delete_index():
    es.indices.delete(collection_index(), ignore=[404, 400])
    es.indices.delete(entity_index(), ignore=[404, 400])
    es.indices.delete(record_index(), ignore=[404, 400])