Example #1
def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    try:
        content_hash = None
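        # Regular files are written to the archive; directories carry no payload.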
        if not path.is_dir():
            content_hash = archive.archive_file(path)
        foreign_id = path.name
        if parent is not None:
            foreign_id = os.path.join(parent.foreign_id, foreign_id)

        # if the job_id is not set yet and path.is_dir(), we know it is the
        # first iteration and we don't create an initial root folder as parent
        # to be consistent with the behaviour of alephclient
        if path.is_dir() and job_id is None:
            document = None
            job_id = Job.random_id()
        else:
            meta = {"file_name": path.name}
            document = Document.save(
                collection,
                parent=parent,
                foreign_id=foreign_id,
                content_hash=content_hash,
                meta=meta,
            )
            db.session.commit()
            job_id = job_id or Job.random_id()
            proxy = document.to_proxy()
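            # Reset any existing ingest state for this entity, then queue it.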
            ingest_flush(collection, entity_id=proxy.id)
            ingest_entity(collection, proxy, job_id=job_id)
            log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)

        if path.is_dir():
            for child in path.iterdir():
                crawl_directory(collection, child, document, job_id)
    except OSError:
        log.exception("Cannot crawl directory: %s", path)
Example #2
def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    try:
        content_hash = None
        if not path.is_dir():
            content_hash = archive.archive_file(path)
        foreign_id = path.name
        if parent is not None:
            foreign_id = os.path.join(parent.foreign_id, foreign_id)
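        # Unlike example #1, this variant also creates a Document for directories,
        # including the root directory of the crawl.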
        meta = {'file_name': path.name}
        document = Document.save(collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta)
        db.session.commit()
        job_id = job_id or Job.random_id()
        ingest_entity(collection, document.to_proxy(), job_id=job_id)
        log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)
        if path.is_dir():
            for child in path.iterdir():
                crawl_directory(collection, child, document, job_id)
    except OSError:
        log.exception("Cannot crawl directory: %s", path)
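Neither example shows how crawl_directory is invoked. The snippet below is a minimal sketch of a possible call, assuming a Collection already exists; Collection.by_foreign_id, the "local_files" foreign id and the path are illustrative assumptions, and the path must be a pathlib.Path, since the function relies on is_dir(), name and iterdir().

from pathlib import Path

from aleph.model import Collection

# Hypothetical collection; replace with one that exists in your instance.
collection = Collection.by_foreign_id("local_files")
# Recursively archives every file under the directory and queues it for ingest.
crawl_directory(collection, Path("/data/incoming"))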
Example #3
def ingest_upload(collection_id):
    """
    ---
    post:
      summary: Upload a document to a collection
      description: Upload a document to a collection with id `collection_id`
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      requestBody:
        content:
          multipart/form-data:
            schema:
              type: object
              properties:
                file:
                  type: string
                  format: binary
                  description: The document to upload
                meta:
                  $ref: '#/components/schemas/DocumentIngest'
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                properties:
                  id:
                    description: id of the uploaded document
                    type: integer
                  status:
                    type: string
                type: object
      tags:
      - Ingest
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
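        # Write each uploaded file into a temporary directory, then store it in
        # the archive; with no file parts (a folder), content_hash stays None.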
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy()
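        # Folders are indexed immediately when sync is requested; the document
        # itself is queued for ingestion.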
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, sync=sync)
        document_id = collection.ns.sign(document.id)
        _notify(collection, document_id)
    finally:
        shutil.rmtree(upload_dir)

    return jsonify({'status': 'ok', 'id': document_id}, status=201)
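As a usage sketch, the request below exercises the endpoint from example #3 with the multipart body described in its docstring (a binary `file` part plus a JSON `meta` field). The host, URL path, collection id and API key are assumptions for illustration, since the route decorator is not shown above.

import json
import requests

# Hypothetical host, collection id and API key.
url = "https://aleph.example.org/api/2/collections/125/ingest"
headers = {"Authorization": "ApiKey 1234567890abcdef"}
meta = {"file_name": "report.pdf"}

with open("report.pdf", "rb") as fh:
    response = requests.post(
        url,
        headers=headers,
        data={"meta": json.dumps(meta)},
        files={"file": ("report.pdf", fh)},
    )
print(response.status_code, response.json())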
Example #4
# execute in the aleph shell:
# exec(open("reingest_partial.py").read())
from servicelayer.jobs import Job
from aleph.queues import ingest_entity
from aleph.model import Collection, Document

job_id = Job.random_id()
collection = Collection.by_id(125)
with open('collection_ftm_failures.csv', 'r') as f:
    for line in f:
        # One document id per line; skip blanks and strip the trailing newline.
        document_id = line.strip()
        if not document_id:
            continue
        print("reingest " + document_id)
        document = Document.by_id(document_id)
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=True)