Exemple #1
0
    def test_delete_expired(self):
        """Exercise delete_expired_exports() across two cleanup passes.

        The first pass is expected to leave only export1 listed for the role
        while export2 is flagged as deleted; after export1's expiry is moved
        into the past, a second pass is expected to flag it as deleted too.
        """
        # Before any cleanup, three exports are listed for the role.
        q = Export.by_role_id(self.role_email.id)
        assert q.count() == 3, q.count()

        # First cleanup pass: only one export should remain listed.
        delete_expired_exports()
        q = Export.by_role_id(self.role_email.id)
        assert q.count() == 1, q.count()
        exp1 = Export.by_id(self.export1.id, deleted=False)
        assert exp1 is not None
        assert exp1.deleted is False
        # Deleted exports are still retrievable when deleted=True is passed.
        exp2 = Export.by_id(self.export2.id, deleted=True)
        assert exp2 is not None
        assert exp2.deleted is True

        # The archived file behind export1 must still exist at this point.
        path = archive.load_file(self.export1.content_hash)
        assert path is not None
        assert path.exists()

        # Move export1's expiry one day into the past so the next pass hits it.
        exp1.expires_at = datetime.utcnow() + timedelta(days=-1)
        db.session.add(exp1)
        db.session.commit()

        # Second cleanup pass: no exports remain listed for the role.
        delete_expired_exports()
        q = Export.by_role_id(self.role_email.id)
        assert q.count() == 0, q.count()
        exp1 = Export.by_id(self.export1.id, deleted=True)
        assert exp1 is not None
        assert exp1.deleted is True
        # NOTE(review): export1's file is still expected to be loadable here,
        # presumably because its content hash is shared with another export
        # -- confirm against the fixture setup.
        path = archive.load_file(self.export1.content_hash)
        assert path is not None, path

        # export3's file, by contrast, is expected to be gone from the archive.
        path = archive.load_file(self.export3.content_hash)
        assert path is None, path
Exemple #2
0
    def test_create(self):
        """Check export1/export2 share hash, archive path and file name.

        Also checks that three notifications are reported for the role.
        """
        first_export = self.export1
        second_export = self.export2

        assert first_export.content_hash is not None
        assert first_export.content_hash == second_export.content_hash
        assert archive.load_file(first_export.content_hash) is not None

        # Both exports must resolve to the same archived path.
        first_path = archive.load_file(first_export.content_hash)
        second_path = archive.load_file(second_export.content_hash)
        assert first_path == second_path
        assert first_export.file_name == second_export.file_name == "experts.csv"

        hits = get_notifications(self.role_email).get("hits")
        notification_count = hits.get("total").get("value")
        assert notification_count == 3, notification_count
Exemple #3
0
def ingest(document_id, file_path=None):
    """Process a given document by extracting its contents.

    This may include creating or updating child documents.

    :param document_id: ID used to look the document up via ``Document.by_id``.
    :param file_path: optional path of the source file. When omitted, the file
        is loaded from the archive into a temporary work directory.
    :returns: ``None``. A missing document is logged and skipped. Errors
        raised while ingesting are logged after the session is rolled back.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    try:
        # Loading happens inside the outer try so the work directory is
        # removed even when the archive lookup itself raises. The error
        # still propagates to the caller, as it did before.
        if file_path is None:
            file_path = archive.load_file(document.content_hash,
                                          file_name=document.safe_file_name,
                                          temp_path=work_path)

        try:
            manager = get_manager()
            result = DocumentResult(manager, document, file_path=file_path)
            # Reuse the manager the result was built with.
            manager.ingest(file_path, result=result, work_path=work_path)

            log.debug('Ingested [%s:%s]: %s', document.id, document.schema,
                      document.name)
            db.session.commit()
            process_document(document)
        except Exception:
            db.session.rollback()
            # Re-fetch the document after the rollback before logging it.
            document = Document.by_id(document_id)
            log.exception("Ingest failed [%s]: %s",
                          document.id, document.name)
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)
Exemple #4
0
def file(document_id):
    """Serve a document's file: redirect to an archive URL or send it locally."""
    document = get_document(document_id)
    redirect_url = archive.generate_url(document.meta)
    if redirect_url is None:
        # No URL available from the archive: stream the local copy instead.
        path = archive.load_file(document.meta)
        handle = open(path, 'rb')
        return send_file(
            handle,
            as_attachment=True,
            attachment_filename=document.meta.file_name,
            mimetype=document.meta.mime_type,
        )
    return redirect(redirect_url)
Exemple #5
0
 def dispatch(cls, collection_id, meta):
     """Load a file, pick an ingestor via auction_file and run it.

     Errors are handed to ``cls.handle_exception``; the archive file is
     always cleaned up afterwards.
     """
     path = archive.load_file(meta)
     try:
         ingestor_cls = cls.auction_file(meta, path)
         log.debug("Dispatching %r to %r", meta.file_name, ingestor_cls)
         ingestor = ingestor_cls(collection_id)
         ingestor.ingest(meta, path)
         db.session.commit()
     except Exception as exc:
         cls.handle_exception(meta, collection_id, exc)
     finally:
         archive.cleanup_file(meta)
Exemple #6
0
def pdf(document_id):
    """Serve the PDF version of a text document.

    Raises ``BadRequest`` for non-text documents. Redirects when the archive
    can generate a URL, otherwise sends the locally loaded file.
    """
    document = get_document(document_id)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")

    pdf_meta = document.meta.pdf
    redirect_url = archive.generate_url(pdf_meta)
    if redirect_url is None:
        path = archive.load_file(pdf_meta)
        handle = open(path, 'rb')
        return send_file(handle, mimetype=pdf_meta.mime_type)
    return redirect(redirect_url)
Exemple #7
0
def _get_table_csv_link(table):
    """Return an archive URL, or a local POSIX path, for a table's CSV.

    Raises ``RuntimeError`` when the table has no ``csvHash`` or when neither
    a URL nor a local file can be obtained.
    """
    csv_hash = model.get_proxy(table).first("csvHash")
    if csv_hash is None:
        raise RuntimeError("Source table doesn't have a CSV version")

    link = archive.generate_url(csv_hash)
    if link is None:
        # Fall back to a locally loaded copy of the CSV.
        cached = archive.load_file(csv_hash)
        link = None if cached is None else cached.as_posix()
    if link is None:
        raise RuntimeError("Could not generate CSV URL for the table")
    return link
Exemple #8
0
def write_document(export_dir, zf, collection, entity):
    """Add an entity's archived file to the zip under the collection label.

    Entities without a ``contentHash`` are skipped. The archive file is
    cleaned up whether or not it was written.
    """
    digest = entity.first("contentHash", quiet=True)
    if digest is not None:
        base_name = entity_filename(entity)
        member = "{0}-{1}".format(entity.id, base_name)
        member = os.path.join(collection.get("label"), member)
        try:
            cached = archive.load_file(digest, temp_path=export_dir)
            if cached is None or not os.path.exists(cached):
                return
            zf.write(cached, arcname=member)
        finally:
            archive.cleanup_file(digest, temp_path=export_dir)
Exemple #9
0
def retrieve():
    """Downloads a binary blob from the blob storage archive.
    ---
    get:
      summary: Download a blob from the archive
      parameters:
      - description: Authorization token for an archive blob
        in: query
        name: token
        schema:
          type: string
          description: A signed JWT with the object hash.
      responses:
        '200':
          description: OK
          content:
            '*/*': {}
        '404':
          description: Object does not exist.
      tags:
      - Archive
    """
    # The query parameter is "token"; the spec above must use the same name.
    token = request.args.get("token")
    # Keep the decoded claims separate from the raw token string.
    claims = jwt.decode(token, key=settings.SECRET_KEY, verify=True)
    content_hash = claims.get("c")
    file_name = claims.get("f")
    mime_type = claims.get("m")
    expire = datetime.utcfromtimestamp(claims["exp"])
    tag_request(content_hash=content_hash, file_name=file_name)
    url = archive.generate_url(
        content_hash,
        file_name=file_name,
        mime_type=mime_type,
        expire=expire,
    )
    if url is not None:
        return redirect(url)
    try:
        local_path = archive.load_file(content_hash)
        if local_path is None:
            return Response(status=404)
        return send_file(
            str(local_path),
            as_attachment=True,
            conditional=True,
            attachment_filename=file_name,
            mimetype=mime_type,
        )
    finally:
        archive.cleanup_file(content_hash)
Exemple #10
0
def get_table_csv_link(table_id):
    """Return an archive URL, or a local POSIX path, for a table's CSV.

    :param table_id: ID passed to ``get_entity`` to fetch the table.
    :returns: the generated URL, or the POSIX path of the locally loaded file.
    :raises RuntimeError: if the table has no ``csvHash`` property, or if
        neither a URL nor a local file could be obtained.
    """
    table = get_entity(table_id)
    properties = table.get('properties', {})
    csv_hash = first(properties.get('csvHash'))
    if csv_hash is None:
        raise RuntimeError("Source table doesn't have a CSV version")
    url = archive.generate_url(csv_hash)
    if not url:
        local_path = archive.load_file(csv_hash)
        if local_path is not None:
            url = local_path.as_posix()
    # Use the same falsy test as above so an empty URL is never returned.
    if not url:
        raise RuntimeError("Could not generate CSV URL for the table")
    return url
Exemple #11
0
def write_document(zip_archive, collection, entity):
    """Add an entity's source file to the zip under the collection label.

    Entities without a ``contentHash`` are skipped. When the archive can
    generate a URL the file is streamed from it; otherwise a locally loaded
    copy is written, if one exists.

    :param zip_archive: zip writer offering ``write_iter`` and ``write``.
    :param collection: mapping whose ``label`` is used as the folder name.
    :param entity: entity proxy providing ``contentHash`` and ``fileName``.
    """
    if not entity.has('contentHash', quiet=True):
        return
    name = entity.first('fileName') or entity.caption
    name = "{0}-{1}".format(entity.id, name)
    path = os.path.join(collection.get('label'), name)
    content_hash = entity.first('contentHash')
    url = archive.generate_url(content_hash)
    if url is not None:
        stream = requests.get(url, stream=True)
        # iter_content() defaults to 1-byte chunks; ask for sensible blocks.
        zip_archive.write_iter(path, stream.iter_content(chunk_size=8192))
    else:
        local_path = archive.load_file(content_hash)
        if local_path is not None:
            zip_archive.write(local_path, arcname=path)
Exemple #12
0
def make_mapper(collection, mapping):
    """Build a model mapping that reads from the mapping's source table CSV.

    :param collection: collection whose ``foreign_id`` becomes the key prefix.
    :param mapping: object providing ``table_id`` and ``query``.
    :returns: the result of ``model.make_mapping`` for the CSV and query.
    :raises RuntimeError: if the table has no ``csvHash`` property, or if
        neither a URL nor a local file could be obtained for the CSV.
    """
    table = get_entity(mapping.table_id)
    properties = table.get('properties', {})
    csv_hash = first(properties.get('csvHash'))
    if csv_hash is None:
        raise RuntimeError("Source table doesn't have a CSV version")
    url = archive.generate_url(csv_hash)
    if not url:
        local_path = archive.load_file(csv_hash)
        if local_path is not None:
            url = local_path.as_posix()
    # Use the same falsy test as above so an empty URL never reaches the mapping.
    if not url:
        raise RuntimeError("Could not generate CSV URL for the table")
    data = {'csv_url': url, 'entities': mapping.query}
    return model.make_mapping(data, key_prefix=collection.foreign_id)
Exemple #13
0
def pdf(document_id):
    """Serve the PDF version of a text document.

    Raises ``BadRequest`` for non-text documents and ``NotFound`` when no
    local PDF can be loaded. Redirects when the archive can generate a URL.
    """
    document = get_document(document_id)
    enable_cache(server_side=True)
    log_event(request, document_id=document.id)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")

    redirect_url = archive.generate_url(document.pdf_version,
                                        mime_type=PDF_MIME)
    if redirect_url is None:
        local_path = archive.load_file(document.pdf_version,
                                       file_name=document.file_name)
        if local_path is None:
            raise NotFound("Missing PDF file.")
        handle = open(local_path, 'rb')
        return send_file(handle, mimetype=PDF_MIME)
    return redirect(redirect_url)
Exemple #14
0
def write_document(zip_archive, collection, entity):
    """Add an entity's source file to the zip under the collection label.

    Entities without a ``contentHash`` are skipped. When the archive can
    generate a URL the file is streamed from it; otherwise a locally loaded
    copy is written, if one exists.

    :param zip_archive: zip writer offering ``write_iter`` and ``write``.
    :param collection: mapping whose ``label`` is used as the folder name.
    :param entity: entity proxy providing ``contentHash`` and ``fileName``.
    """
    if not entity.has('contentHash', quiet=True):
        return
    name = entity.first('fileName') or entity.caption
    name = "{0}-{1}".format(entity.id, name)
    path = os.path.join(collection.get('label'), name)
    content_hash = entity.first('contentHash')
    url = archive.generate_url(content_hash)
    if url is not None:
        stream = requests.get(url, stream=True)
        # iter_content() defaults to 1-byte chunks; ask for sensible blocks.
        zip_archive.write_iter(path, stream.iter_content(chunk_size=8192))
    else:
        local_path = archive.load_file(content_hash)
        if local_path is not None:
            zip_archive.write(local_path, arcname=path)
Exemple #15
0
def ingest(document_id, file_path=None, refresh=False):
    """Process a given document by extracting its contents.

    This may include creating or updating child documents.

    :param document_id: ID used to look the document up via ``Document.by_id``.
    :param file_path: optional path of the source file. When omitted, the file
        is loaded from the archive into a temporary work directory.
    :param refresh: when true, no ``INGEST_DOCUMENT`` event is published for
        casefile collections.
    :returns: ``None``. A missing document is logged and skipped. Ingest
        errors are logged and recorded as ``STATUS_FAIL`` on the document.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    try:
        # Loading happens inside the outer try so the work directory is
        # removed even when the archive lookup itself raises. The error
        # still propagates to the caller, as it did before.
        if file_path is None:
            file_path = archive.load_file(document.content_hash,
                                          file_name=document.safe_file_name,
                                          temp_path=work_path)

        try:
            manager = get_manager()
            result = DocumentResult(manager, document, file_path=file_path)
            # Reuse the manager the result was built with.
            manager.ingest(file_path, result=result, work_path=work_path)

            document.status = Document.STATUS_SUCCESS
            log.debug('Ingested [%s:%s]: %s', document.id, document.schema,
                      document.name)

            if document.collection.casefile and not refresh:
                params = {
                    'collection': document.collection,
                    'document': document,
                }
                publish(Events.INGEST_DOCUMENT,
                        actor_id=document.uploader_id,
                        params=params)

            db.session.commit()
        except Exception:
            db.session.rollback()
            # Re-fetch the document after the rollback, then record failure.
            document = Document.by_id(document_id)
            log.exception("Ingest failed [%s]: %s",
                          document.id, document.name)
            document.status = Document.STATUS_FAIL
            db.session.commit()
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

    # Post-processing runs for both successful and failed ingests.
    extract_document_tags(document)
    # delete_entity(document.id, exclude=document.schema)
    index_document(document)
    refresh_entity(document)
Exemple #16
0
def file(document_id):
    """Serve a document's original file.

    Redirects when the archive can generate a URL for the file; otherwise the
    locally loaded copy is sent as an attachment.

    :param document_id: ID passed to ``get_document``.
    :raises NotFound: when no local file could be loaded or it does not exist.
    """
    document = get_document(document_id)
    enable_cache(server_side=True)
    log_event(request, document_id=document.id)
    url = archive.generate_url(document.meta)
    if url is not None:
        return redirect(url)

    local_path = archive.load_file(document.meta)
    # os.path.isfile(None) raises TypeError, so test for None explicitly and
    # answer with a 404 instead of a server error.
    if local_path is None or not os.path.isfile(local_path):
        raise NotFound("File does not exist.")

    fh = open(local_path, 'rb')
    return send_file(fh, as_attachment=True,
                     attachment_filename=document.meta.file_name,
                     mimetype=document.meta.mime_type)
Exemple #17
0
def pdf(document_id):
    """Serve the PDF version of a text document.

    Raises ``BadRequest`` for non-text documents. Any error while loading or
    opening the local PDF is turned into ``NotFound``.
    """
    document = get_document(document_id)
    enable_cache(server_side=True)
    log_event(request, document_id=document.id)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")

    pdf_meta = document.meta.pdf
    redirect_url = archive.generate_url(pdf_meta)
    if redirect_url is not None:
        return redirect(redirect_url)

    try:
        handle = open(archive.load_file(pdf_meta), 'rb')
    except Exception as ex:
        raise NotFound("Missing PDF file: %r" % ex)
    return send_file(handle, mimetype=pdf_meta.mime_type)
Exemple #18
0
def retrieve():
    """Downloads a binary blob from the blob storage archive.
    ---
    get:
      summary: Download a blob from the archive
      parameters:
      - description: Authorization token for an archive blob
        in: query
        name: claim
        schema:
          type: string
          description: A signed JWT with the object hash.
      responses:
        '200':
          description: OK
          content:
            '*/*': {}
        '404':
          description: Object does not exist.
      tags:
      - Archive
    """
    role_id, content_hash, file_name, mime_type = archive_claim(
        request.args.get("claim")
    )
    # Only the role the claim was issued for may fetch the blob.
    require(request.authz.id == role_id)
    tag_request(content_hash=content_hash, file_name=file_name)

    redirect_url = archive.generate_url(
        content_hash, file_name=file_name, mime_type=mime_type
    )
    if redirect_url is not None:
        return redirect(redirect_url)

    try:
        blob_path = archive.load_file(content_hash)
        if blob_path is not None:
            return send_file(
                str(blob_path),
                as_attachment=True,
                conditional=True,
                attachment_filename=file_name,
                mimetype=mime_type,
            )
        return Response(status=404)
    finally:
        archive.cleanup_file(content_hash)
Exemple #19
0
def _serve_archive(content_hash, file_name, mime_type):
    """Answer with a redirect to an archive URL, or send the local file.

    Responds with status 404 when no local file is available.
    """
    redirect_url = archive.generate_url(
        content_hash, file_name=file_name, mime_type=mime_type
    )
    if redirect_url is not None:
        return redirect(redirect_url)

    try:
        cached = archive.load_file(content_hash, file_name=file_name)
        if cached is not None:
            return send_file(
                cached,
                as_attachment=True,
                conditional=True,
                attachment_filename=file_name,
                mimetype=mime_type,
            )
        return Response(status=404)
    finally:
        archive.cleanup_file(content_hash)
Exemple #20
0
 def dispatch(cls, source_id, meta):
     """Run the highest-scoring ingestor for a file, if any scores above zero.

     Exceptions are logged and the session rolled back; the archive file is
     always cleaned up afterwards.
     """
     top_score, chosen = 0, None
     local_path = archive.load_file(meta)
     try:
         for candidate in get_ingestors().values():
             candidate_score = candidate.match(meta, local_path)
             if candidate_score > top_score:
                 top_score, chosen = candidate_score, candidate
         if chosen is None:
             log.debug("No ingestor found for: %r", meta.file_name)
             return
         log.debug("Dispatching %r to %r", meta.file_name, chosen.__name__)
         chosen(source_id).ingest(meta, local_path)
     except Exception as ex:
         log.exception(ex)
         db.session.rollback()
     finally:
         archive.cleanup_file(meta)
Exemple #21
0
def _serve_archive(content_hash, file_name, mime_type):
    """Answer with a redirect to an archive URL, or send the local file.

    Raises ``NotFound`` when no local file is available.
    """
    redirect_url = archive.generate_url(
        content_hash, file_name=file_name, mime_type=mime_type
    )
    if redirect_url is not None:
        return redirect(redirect_url)

    enable_cache()
    try:
        cached = archive.load_file(content_hash, file_name=file_name)
        if cached is None:
            raise NotFound("File does not exist.")
        handle = open(cached, 'rb')
        return send_file(
            handle,
            as_attachment=True,
            attachment_filename=file_name,
            mimetype=mime_type,
        )
    finally:
        archive.cleanup_file(content_hash)
Exemple #22
0
def retrieve():
    # Resolve the claim, check it belongs to the requesting role, then either
    # redirect to an archive URL or send the locally loaded blob.
    role_id, content_hash, file_name, mime_type = archive_claim(
        request.args.get('claim')
    )
    require(request.authz.id == role_id)
    tag_request(content_hash=content_hash, file_name=file_name)

    redirect_url = archive.generate_url(
        content_hash, file_name=file_name, mime_type=mime_type
    )
    if redirect_url is not None:
        return redirect(redirect_url)

    try:
        blob_path = archive.load_file(content_hash)
        if blob_path is not None:
            return send_file(
                str(blob_path),
                as_attachment=True,
                conditional=True,
                attachment_filename=file_name,
                mimetype=mime_type,
            )
        return Response(status=404)
    finally:
        archive.cleanup_file(content_hash)
Exemple #23
0
def retrieve():
    # Resolve the claim, check it belongs to the requesting role, record the
    # access, then either redirect to an archive URL or send the local blob.
    role_id, content_hash, file_name, mime_type = archive_claim(
        request.args.get('claim')
    )
    require(request.authz.id == role_id)
    record_audit(Audit.ACT_ARCHIVE, content_hash=content_hash)
    tag_request(content_hash=content_hash, file_name=file_name)

    redirect_url = archive.generate_url(
        content_hash, file_name=file_name, mime_type=mime_type
    )
    if redirect_url is not None:
        return redirect(redirect_url)

    try:
        blob_path = archive.load_file(content_hash)
        if blob_path is not None:
            return send_file(
                blob_path,
                as_attachment=True,
                conditional=True,
                attachment_filename=file_name,
                mimetype=mime_type,
            )
        return Response(status=404)
    finally:
        archive.cleanup_file(content_hash)