Example #1
def complete_export(export_id, file_path):
    export = Export.by_id(export_id)
    file_path = ensure_path(file_path)
    export.file_name = safe_filename(file_path)
    export.file_size = file_path.stat().st_size
    export.content_hash = checksum(file_path)
    try:
        archive.archive_file(file_path,
                             content_hash=export.content_hash,
                             mime_type=export.mime_type)
        export.set_status(status=Status.SUCCESS)
    except Exception:
        log.exception("Failed to upload export: %s", export)
        export.set_status(status=Status.FAILED)

    db.session.commit()
    params = {"export": export}
    role = Role.by_id(export.creator_id)
    log.info("Export [%r] complete: %s", export, export.status)
    publish(
        Events.COMPLETE_EXPORT,
        params=params,
        channels=[role],
    )
    send_export_notification(export)
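Every example on this page routes incoming paths through ensure_path() before touching the filesystem. The helper itself is never shown here, but its usage constrains it: it accepts None, strings (e.g. the return value of mkdtemp()), and pathlib.Path objects, and its result supports .joinpath(), .stat() and .parents. A minimal sketch consistent with that usage, assuming (not quoting) the servicelayer implementation:

from pathlib import Path

def ensure_path(file_path):
    # Pass None through; callers guard with `if path is not None` afterwards.
    if file_path is None:
        return None
    # Normalise strings into pathlib.Path objects; whether the real helper
    # also resolves symlinks is an assumption made here.
    return Path(file_path).resolve()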
Example #2
def export_entities(export_id, result):
    from aleph.logic import resolver

    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    try:
        entities = []
        stub = types.SimpleNamespace(result=result)
        for entity in result["results"]:
            resolver.queue(stub, Collection, entity.get("collection_id"))
            entities.append(model.get_proxy(entity))
        resolver.resolve(stub)

        file_path = export_dir.joinpath("query-export.zip")
        # Use a context manager so the zip file is closed even on error.
        with zipfile.ZipFile(file_path, "w") as zf:
            exporter = ExcelExporter(None, extra=EXTRA_HEADERS)
            for entity in entities:
                collection_id = entity.context.get("collection_id")
                collection = resolver.get(stub, Collection, collection_id)
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
            content = exporter.get_bytesio().getvalue()
            zf.writestr("Export.xlsx", content)
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Export.STATUS_FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Example #3
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy()
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, sync=sync)
        document_id = collection.ns.sign(document.id)
        _notify(collection, document_id)
    finally:
        shutil.rmtree(upload_dir)

    return jsonify({'status': 'ok', 'id': document_id}, status=201)
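Note how the upload handler runs the user-supplied storage.filename through safe_filename(..., default='upload') before joining it onto the temporary directory, so a hostile filename cannot place the file elsewhere. The helper is not shown on this page; a rough sketch of the assumed behaviour (not the actual servicelayer code), using normality.slugify, which these projects already depend on:

from pathlib import PurePath
from normality import slugify

def safe_filename(file_name, default=None, extension=None):
    # Assumed behaviour: keep only the basename, slugify the stem, and
    # preserve (or override) the extension; fall back to `default`.
    if file_name is None:
        return default
    path = PurePath(str(file_name))
    name = slugify(path.stem, sep="_") or default
    if name is None:
        return None
    ext = (extension or path.suffix.lstrip(".")).strip().lower()
    return "%s.%s" % (name, ext) if ext else name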
Example #4
def export_entities(export_id):
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("query-export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_path = export_dir.joinpath(EXCEL_FILE)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for entity in iter_proxies(filters=filters):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= Export.MAX_FILE_SIZE:
                    log.warning("Export too large: %r", export)
                    break

            exporter.finalize()
            zf.write(excel_path, arcname=EXCEL_FILE)
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Example #5
    def __init__(self, stage, context):
        self.stage = stage
        self.context = context
        self.work_path = ensure_path(mkdtemp(prefix='ingestor-'))
        self.emitted = set()
        self._writer = None
        self._dataset = None
Example #6
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        db.session.commit()
        proxy = document.to_proxy()
        ingest_entity(collection, proxy)
    finally:
        shutil.rmtree(upload_dir)

    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
Example #7
    def set_filepath(self, file_path):
        file_path = ensure_path(file_path)
        file_name = safe_filename(file_path)
        file_size = file_path.stat().st_size
        self.file_name = file_name
        self.file_size = file_size
        self._file_path = file_path
        self.content_hash = checksum(file_path)
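Examples #1, #7, #21 and #22 all derive content_hash via checksum(). The helper is not reproduced on this page, but the docstrings in Examples #21 and #22 describe the result as a SHA1 content hash, so a streaming sketch might look like this (an assumption based on those docstrings, not the library's code):

import hashlib

def checksum(file_path, chunk_size=1024 * 1024):
    # Stream the file in chunks so arbitrarily large uploads can be
    # hashed without loading them into memory.
    digest = hashlib.sha1()
    with open(file_path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()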
Example #8
    def __init__(self, dataset, stage, context):
        self.dataset = dataset
        self.writer = dataset.bulk()
        self.stage = stage
        self.context = context
        self.ns = Namespace(self.context.get("namespace"))
        self.work_path = ensure_path(mkdtemp(prefix="ingestor-"))
        self.emitted = set()
Example #9
    def _get_local_prefix(self, content_hash, temp_path=None):
        """Determine a temporary path for the file on the local file
        system."""
        if temp_path is None:
            if not hasattr(self.local, 'dir'):
                self.local.dir = tempfile.mkdtemp(prefix=self.base_name)
            temp_path = self.local.dir
        return ensure_path(temp_path).joinpath('%s.sl' % content_hash)
Example #10
    def __init__(self, queue, context):
        self.queue = queue
        # TODO: Probably a good idea to make context readonly since we are
        # reusing it in child ingestors
        self.context = context
        self.work_path = ensure_path(mkdtemp(prefix='ingestor-'))
        self._emit_count = 0
        self._writer = None
        self._dataset = None
Example #11
    def make_work_file(self, file_name, prefix=None):
        if prefix is not None:
            prefix = ensure_path(prefix)
            if self.manager.work_path not in prefix.parents:
                raise ProcessingException("Path escalation: %r" % prefix)
        prefix = prefix or self.manager.work_path
        work_file = prefix.joinpath(file_name)
        if prefix not in work_file.parents:
            raise ProcessingException("Path escalation: %r" % file_name)
        if not work_file.parent.exists():
            work_file.parent.mkdir(parents=True, exist_ok=True)
        return work_file
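The two parents checks are the heart of this method: both the optional prefix and the final work_file must remain inside the manager's work directory. A standalone illustration of why the check works, using only pathlib (the directory name is hypothetical):

from pathlib import Path

work_path = Path("/tmp/ingestor-abc123")       # hypothetical work directory
safe = work_path.joinpath("pages/page-1.txt")
print(work_path in safe.parents)               # True: stays under work_path

# joinpath() with an absolute argument discards the prefix entirely,
# which is exactly the case the parents check rejects:
evil = work_path.joinpath("/etc/passwd")
print(evil)                                    # /etc/passwd
print(work_path in evil.parents)               # False: ProcessingException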
Example #12
    def extract_hierarchy(self, entity, name):
        """Given a file path, create all its ancestor folders as entities."""
        foreign_id = pathlib.PurePath(entity.id)
        path = ensure_path(name)
        for name in path.as_posix().split("/")[:-1]:
            foreign_id = foreign_id.joinpath(name)
            if name in self.EXCLUDE:
                continue
            entity = self.manager.make_entity("Folder", parent=entity)
            entity.add("fileName", name)
            entity.make_id(foreign_id.as_posix())
            self.manager.emit_entity(entity)
        return entity
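The [:-1] slice is what keeps the file itself out of the folder chain: only the ancestor directories become Folder entities, each parented to the previous one. A quick standard-library trace of the loop's input:

from pathlib import PurePath

parts = PurePath("reports/2020/summary.pdf").as_posix().split("/")[:-1]
print(parts)  # ['reports', '2020'] -- the file name itself is never included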
Example #13
def export_matches(export_id):
    """Export the top N matches of cross-referencing for the given collection
    to an Excel formatted export."""
    export = Export.by_id(export_id)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    try:
        role = Role.by_id(export.creator_id)
        authz = Authz.from_role(role)
        collection = Collection.by_id(export.collection_id)
        file_name = "%s - Crossreference.xlsx" % collection.label
        file_path = export_dir.joinpath(file_name)
        excel = ExcelWriter()
        headers = [
            "Score",
            "Entity Name",
            "Entity Date",
            "Entity Countries",
            "Candidate Collection",
            "Candidate Name",
            "Candidate Date",
            "Candidate Countries",
            "Entity Link",
            "Candidate Link",
        ]
        sheet = excel.make_sheet("Cross-reference", headers)
        batch = []

        for match in iter_matches(collection, authz):
            batch.append(match)
            if len(batch) >= BULK_PAGE:
                _iter_match_batch(excel, sheet, batch)
                batch = []
        if batch:
            _iter_match_batch(excel, sheet, batch)

        with open(file_path, "wb") as fp:
            fp.write(excel.get_bytesio().getvalue())

        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Example #14
def _ingest_path(db, conn, dataset, path, languages=None):
    # Avoid a mutable default argument; fall back to an empty list here.
    context = {'languages': languages or []}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
Example #15
    def ingest(self, file_path, entity, **kwargs):
        """Main execution step of an ingestor."""
        file_path = ensure_path(file_path)
        if file_path.is_file() and not entity.has("fileSize"):
            entity.add("fileSize", file_path.stat().st_size)

        entity.set("processingStatus", self.STATUS_FAILURE)
        try:
            ingestor_class = self.auction(file_path, entity)
            log.info("Ingestor [%r]: %s", entity, ingestor_class.__name__)
            self.delegate(ingestor_class, file_path, entity)
            entity.set("processingStatus", self.STATUS_SUCCESS)
        except ProcessingException as pexc:
            entity.set("processingError", stringify(pexc))
            log.error("[%r] Failed to process: %s", entity, pexc)
        finally:
            self.finalize(entity)
Example #16
    def fixture(self, fixture_path):
        """Returns a fixture path and a dummy entity"""
        # clear out entities
        self.manager.entities = []
        self.manager.dataset.delete()
        cur_path = ensure_path(__file__).parent
        cur_path = cur_path.joinpath('fixtures')
        path = cur_path.joinpath(fixture_path)
        entity = self.manager.make_entity('Document')
        if path.is_file():
            checksum = self.manager.store(path)
            entity.make_id(path.name, checksum)
            entity.set('contentHash', checksum)
            entity.set('fileSize', path.stat().st_size)
            entity.set('fileName', path.name)
        else:
            entity.make_id(fixture_path)
        return path, entity
Example #17
def _ingest_path(db, conn, dataset, path, languages=None):
    # Avoid a mutable default argument; fall back to an empty list here.
    context = {"languages": languages or []}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity("Document")
            checksum = manager.store(path)
            entity.set("contentHash", checksum)
            entity.make_id(checksum)
            entity.set("fileName", path.name)
            log.info("Queue: %r", entity.to_dict())
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
Example #18
def ingest(path, dataset, languages=None):
    """Queue a set of files for ingest."""
    context = {'languages': languages or []}
    conn = get_redis()
    queue = ServiceQueue(conn, ServiceQueue.OP_INGEST, dataset)
    manager = Manager(queue, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
Example #19
File: export.py Project: sunu/aleph
def export_entities(export_id):
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_name = safe_filename(export.label, extension="xlsx")
            excel_path = export_dir.joinpath(excel_name)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for idx, entity in enumerate(iter_proxies(filters=filters)):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= settings.EXPORT_MAX_SIZE:
                    concern = "total size of the"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
                if idx >= settings.EXPORT_MAX_RESULTS:
                    concern = "number of"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break

            exporter.finalize()
            zf.write(excel_path, arcname=excel_name)
        file_name = "Export: %s" % export.label
        file_name = safe_filename(file_name, extension="zip")
        complete_export(export_id, file_path, file_name)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Example #20
    def fixture(self, fixture_path):
        """Returns a fixture path and a dummy entity"""
        # clear out entities
        self.manager.entities = []
        self.manager.dataset.delete()
        cur_path = ensure_path(__file__).parent
        cur_path = cur_path.joinpath("fixtures")
        path = cur_path.joinpath(fixture_path)
        entity = self.manager.make_entity("Document")
        if not path.exists():
            raise RuntimeError(path)
        if path.is_file():
            checksum = self.manager.store(path)
            entity.make_id(path.name, checksum)
            entity.set("contentHash", checksum)
            entity.set("fileSize", path.stat().st_size)
            entity.set("fileName", path.name)
        else:
            entity.make_id(fixture_path)
        return path, entity
Example #21
    def archive_file(self, file_path, content_hash=None, mime_type=None):
        """Store the file located at the given path on S3, based on a path
        made up from its SHA1 content hash."""
        file_path = ensure_path(file_path)
        if content_hash is None:
            content_hash = checksum(file_path)

        if content_hash is None:
            return

        obj = self._locate_key(content_hash)
        if obj is not None:
            return content_hash

        path = '{}/data'.format(self._get_prefix(content_hash))
        extra = {}
        if mime_type is not None:
            extra['ContentType'] = mime_type
        with open(file_path, 'rb') as fh:
            self.client.upload_fileobj(fh, self.bucket, str(path),
                                       ExtraArgs=extra)
        return content_hash
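Both storage backends address blobs by a path derived from the content hash: self._get_prefix(...) here, and path_prefix(...) in the Google variant below. Neither helper appears on this page; a plausible sketch of the sharding scheme they are assumed to compute (not the library's actual code):

import os

def path_prefix(content_hash):
    # Assumed layout: shard by the first two hex-digit pairs so that no
    # single directory accumulates millions of objects.
    return os.path.join(content_hash[:2], content_hash[2:4], content_hash)

print(path_prefix("da39a3ee5e6b4b0d3255bfef95601890afd80709"))
# da/39/da39a3ee5e6b4b0d3255bfef95601890afd80709
# Example #21 then appends "/data" to address the stored blob itself.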
Example #22
    def archive_file(self, file_path, content_hash=None, mime_type=None):
        """Store the file located at the given path on Google, based on a path
        made up from its SHA1 content hash."""
        file_path = ensure_path(file_path)
        if content_hash is None:
            content_hash = checksum(file_path)

        if content_hash is None:
            return

        file_path = ensure_posix_path(file_path)
        for attempt in service_retries():
            try:
                # blob = self._locate_contenthash(content_hash)
                # if blob is not None:
                #     return content_hash

                path = os.path.join(path_prefix(content_hash), "data")
                blob = Blob(path, self.bucket)
                blob.upload_from_filename(file_path, content_type=mime_type)
                return content_hash
            except FAILURES:
                log.exception("Store error in GS")
                backoff(failures=attempt)
Example #23
    def setUp(self):
        self.mock = mock_s3()
        self.mock.start()
        self.archive = init_archive('s3', bucket='foo')
        self.file = ensure_path(__file__)
Example #24
    def setUp(self):
        tempdir = ensure_path(tempfile.gettempdir())
        self.path = tempdir.joinpath('sltest')
        self.archive = init_archive('file', path=self.path)
        self.file = ensure_path(__file__)
Example #25
    def __init__(self, path=None):
        self.path = ensure_path(path)
        if self.path is None:
            raise ValueError('No archive path is set.')
        log.info("Archive: %s", self.path)
Example #26
def ingest_upload(collection_id):
    """
    ---
    post:
      summary: Upload a document to a collection
      description: Upload a document to a collection with id `collection_id`
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      requestBody:
        content:
          multipart/form-data:
            schema:
              type: object
              properties:
                file:
                  type: string
                  format: binary
                  description: The document to upload
                meta:
                  $ref: '#/components/schemas/DocumentIngest'
      responses:
        '201':
          description: OK
          content:
            application/json:
              schema:
                properties:
                  id:
                    description: id of the uploaded document
                    type: integer
                  status:
                    type: string
                type: object
      tags:
      - Ingest
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy()
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, sync=sync)
        document_id = collection.ns.sign(document.id)
        _notify(collection, document_id)
    finally:
        shutil.rmtree(upload_dir)

    return jsonify({'status': 'ok', 'id': document_id}, status=201)
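For reference, a hypothetical client-side call matching the OpenAPI description above. The host, port, collection id and API key are assumptions about a local aleph deployment, not taken from this page:

import json
import requests

resp = requests.post(
    "http://localhost:8080/api/2/collections/1/ingest",
    headers={"Authorization": "ApiKey YOUR_API_KEY"},
    files={"file": open("report.pdf", "rb")},
    data={"meta": json.dumps({"file_name": "report.pdf"})},
)
print(resp.status_code, resp.json())  # expect 201 with {'status': 'ok', 'id': ...}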
Example #27
    def store(self, file_path, mime_type=None):
        file_path = ensure_path(file_path)
        mime_type = normalize_mimetype(mime_type)
        if file_path is not None and file_path.is_file():
            return self.archive.archive_file(file_path, mime_type=mime_type)
Example #28
    def setUp(self):
        self.mock = mock_s3()
        self.mock.start()
        self.archive = init_archive("s3", bucket="foo", publication_bucket="foo")
        self.file = ensure_path(__file__)