Example #1
def complete_export(export_id, file_path):
    export = Export.by_id(export_id)
    file_path = ensure_path(file_path)
    export.file_name = safe_filename(file_path)
    export.file_size = file_path.stat().st_size
    export.content_hash = checksum(file_path)
    try:
        archive.archive_file(file_path,
                             content_hash=export.content_hash,
                             mime_type=export.mime_type)
        export.set_status(status=Status.SUCCESS)
    except Exception:
        log.exception("Failed to upload export: %s", export)
        export.set_status(status=Status.FAILED)

    db.session.commit()
    params = {"export": export}
    role = Role.by_id(export.creator_id)
    log.info("Export [%r] complete: %s", export, export.status)
    publish(
        Events.COMPLETE_EXPORT,
        params=params,
        channels=[role],
    )
    send_export_notification(export)
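Taken together, these examples show archive.archive_file used with two distinct signatures: a newer one that takes a file path (optionally with content_hash and mime_type keyword arguments, as above) and returns the content hash under which the blob is stored, and an older one (examples #2, #4, #9 and #15 below) that takes a Metadata object plus a move flag and returns the updated metadata. The following is a standard-library sketch of the content-addressing idea behind the hash-returning form; it illustrates the concept only and is not aleph's actual implementation:

import hashlib
import shutil
from pathlib import Path

def archive_file_sketch(file_path, archive_root="/tmp/archive"):
    """Copy a file into a content-addressed store and return its hash."""
    data = Path(file_path).read_bytes()
    content_hash = hashlib.sha1(data).hexdigest()
    # Shard the store by hash prefix to keep directories small.
    dest = Path(archive_root) / content_hash[:2] / content_hash[2:4] / content_hash
    dest.parent.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(file_path, dest)
    return content_hash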
Example #2
def ingest_file(source_id, meta, file_name, move=False):
    if not os.path.isfile(file_name):
        raise ValueError("No such file: %r", file_name)
    if not meta.has("source_path"):
        meta.source_path = file_name
    meta = archive.archive_file(file_name, meta, move=move)
    ingest.delay(source_id, meta.data)
Example #3
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy()
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, sync=sync)
        document_id = collection.ns.sign(document.id)
        _notify(collection, document_id)
    finally:
        shutil.rmtree(upload_dir)

    return jsonify({'status': 'ok', 'id': document_id}, status=201)
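The handler above reads files from a multipart form and document metadata from a meta form field. Here is a hypothetical client-side call against it; the URL prefix, the ApiKey header and the exact layout of the meta field are assumptions inferred from the code, not a documented contract:

import json
import requests

url = "https://aleph.example.org/api/2/collections/42/ingest"
meta = {"file_name": "report.pdf", "languages": ["en"]}
with open("report.pdf", "rb") as fh:
    res = requests.post(url,
                        headers={"Authorization": "ApiKey <key>"},
                        data={"meta": json.dumps(meta)},
                        files={"file": fh})
res.raise_for_status()
print(res.json()["id"])  # the signed document id returned above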
Example #4
File: __init__.py, Project: maquchizi/aleph
def ingest_url(self, collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    if meta.foreign_id is None:
        meta.foreign_id = url
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 399:
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
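The retry delay above is 3600 ** self.request.retries, exponential in the retry count. It is worth spelling out what that produces: the first retry fires after one second, the second after an hour, and the third not for months.

for retries in range(4):
    print(retries, 3600 ** retries)
# 0 1            (one second)
# 1 3600         (one hour)
# 2 12960000     (150 days)
# 3 46656000000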
Example #5
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        db.session.commit()
        proxy = document.to_proxy()
        ingest_entity(collection, proxy)
    finally:
        shutil.rmtree(upload_dir)

    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
Example #6
def ingest_document(document, file_path, user_queue=False):
    """Given a stub document and file path, extract information.
    This does not attempt to infer metadata such as a file name."""
    document.status = Document.STATUS_PENDING
    if os.path.isdir(file_path):
        manager = get_manager()
        manager.ingest_document(document, file_path=file_path)
    else:
        ch = archive.archive_file(file_path,
                                  content_hash=document.content_hash)
        document.content_hash = ch or document.content_hash
        db.session.commit()
        queue = USER_QUEUE if user_queue else WORKER_QUEUE
        routing_key = USER_ROUTING_KEY if user_queue else WORKER_ROUTING_KEY
        ingest.apply_async([document.id], queue=queue, routing_key=routing_key)
Example #7
 def setUp(self):
     super(MappingAPITest, self).setUp()
     self.col = self.create_collection(foreign_id="map1")
     aggregator = get_aggregator(self.col)
     aggregator.delete()
     _, self.headers = self.login(is_admin=True)
     self.rolex = self.create_user(foreign_id="user_3")
     _, self.headers_x = self.login(foreign_id="user_3")
     self.fixture = self.get_fixture_path("experts.csv")
     self.content_hash = archive.archive_file(self.fixture)
     data = {
         "id": "foo",
         "schema": "Table",
         "properties": {
             "csvHash": self.content_hash,
             "contentHash": self.content_hash,
             "mimeType": "text/csv",
             "fileName": "experts.csv",
             "name": "experts.csv",
         },
     }
     self.ent = EntityProxy.from_dict(model, data, cleaned=False)
     self.ent.id = self.col.ns.sign(self.ent.id)
     index_proxy(self.col, self.ent)
     data = {
         "id": "foo2",
         "schema": "Table",
         "properties": {
             "csvHash": self.content_hash,
             "contentHash": self.content_hash,
             "mimeType": "text/csv",
             "fileName": "experts.csv",
             "name": "experts.csv",
         },
     }
     self.ent2 = EntityProxy.from_dict(model, data, cleaned=False)
     self.ent2.id = self.col.ns.sign(self.ent2.id)
     index_proxy(self.col, self.ent2)
     data = {
         "id": "bar",
         "schema": "LegalEntity",
         "properties": {
             "name": "John Doe"
         },
     }
     ent = EntityProxy.from_dict(model, data, cleaned=False)
     ent.id = self.col.ns.sign(ent.id)
     index_proxy(self.col, ent)
Example #8
 def setUp(self):
     super(MappingAPITest, self).setUp()
     self.col = self.create_collection(data={'foreign_id': 'map1'})
     _, self.headers = self.login(is_admin=True)
     self.rolex = self.create_user(foreign_id='user_3')
     _, self.headers_x = self.login(foreign_id='user_3')
     self.fixture = self.get_fixture_path('experts.csv')
     self.content_hash = archive.archive_file(self.fixture)
     data = {
         'id': 'foo',
         'schema': 'Table',
         'properties': {
             'csvHash': self.content_hash,
             'contentHash': self.content_hash,
             'mimeType': 'text/csv',
             'fileName': 'experts.csv',
             'name': 'experts.csv'
         }
     }
     self.ent = EntityProxy.from_dict(model, data)
     self.ent.id = self.col.ns.sign(self.ent.id)
     index_proxy(self.col, self.ent)
     data = {
         'id': 'foo2',
         'schema': 'Table',
         'properties': {
             'csvHash': self.content_hash,
             'contentHash': self.content_hash,
             'mimeType': 'text/csv',
             'fileName': 'experts.csv',
             'name': 'experts.csv'
         }
     }
     self.ent2 = EntityProxy.from_dict(model, data)
     self.ent2.id = self.col.ns.sign(self.ent2.id)
     index_proxy(self.col, self.ent2)
     data = {
         'id': 'bar',
         'schema': 'LegalEntity',
         'properties': {
             'name': 'John Doe'
         }
     }
     ent = EntityProxy.from_dict(model, data)
     ent.id = self.col.ns.sign(ent.id)
     index_proxy(self.col, ent)
Example #9
def ingest_url(source_id, metadata, url):
    clear_session()
    meta = Metadata(data=metadata)
    with NamedTemporaryFile() as fh:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 400:
            log.error("Error ingesting %r: %r", url, res.status_code)
        for chunk in res.iter_content(chunk_size=1024):
            if chunk:
                fh.write(chunk)
        fh.flush()
        if not meta.has("source_url"):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(fh.name, meta, move=True)
        ingest.delay(source_id, meta.data)
Example #10
def ingest_document(document, file_path, role_id=None, shallow=False):
    """Given a stub document and file path, extract information.
    This does not attempt to infer metadata such as a file name."""
    document.status = Document.STATUS_PENDING
    if not is_file(file_path):
        manager = get_manager()
        manager.ingest_document(document,
                                file_path=file_path,
                                role_id=role_id,
                                shallow=shallow)
    else:
        document.content_hash = archive.archive_file(file_path)
        db.session.commit()
        priority = 5 if document.collection.casefile else 3
        ingest.apply_async(args=[document.id],
                           kwargs={'role_id': role_id},
                           priority=priority)
Example #11
def ingest_url(self, document_id, url):
    """Load the given URL into the document specified by document_id."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    tmp_path = make_tempfile(document.file_name, suffix=document.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 500:
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
            return
        if res.status_code >= 400:
            document.status = Document.STATUS_FAIL
            document.error_message = "HTTP %s: %s" % (res.status_code, url)
            db.session.commit()
            return
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not document.has_meta('source_url'):
            document.source_url = res.url
        if not document.foreign_id:
            document.foreign_id = res.url
        document.headers = res.headers
        document.content_hash = archive.archive_file(tmp_path)
        db.session.commit()
        get_manager().ingest_document(document)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        document.status = Document.STATUS_FAIL
        document.error_type = type(ex).__name__
        document.error_message = six.text_type(ex)
        db.session.commit()
        log.exception(ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
Example #12
def ingest_document(document, file_path, role_id=None):
    """Given a stub document and file path, extract information.
    This does not attempt to infer metadata such as a file name."""
    document.status = Document.STATUS_PENDING
    if role_id is not None:
        document.uploader_id = role_id

    if file_path is not None:
        # Directories cannot be archived first and processed later,
        # so they take a short-cut straight into ingest here.
        if os.path.isdir(file_path):
            db.session.commit()
            return ingest(document.id, file_path=file_path)
        document.content_hash = archive.archive_file(file_path)

    db.session.commit()
    priority = 5 if document.collection.casefile else 3
    ingest.apply_async(args=[document.id], priority=priority)
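Examples #10 and #12 raise the Celery message priority for casefile collections so that user-facing uploads overtake long-running batch imports. A minimal sketch of that pattern; the task body, broker URL and the is_casefile stand-in are hypothetical, not aleph's actual wiring:

from celery import Celery

app = Celery("aleph", broker="redis://localhost:6379/0")

@app.task(bind=True)
def ingest(self, document_id, role_id=None):
    pass  # look up the document and run the ingest pipeline

is_casefile = True  # stand-in for document.collection.casefile
priority = 5 if is_casefile else 3
# Message priority requires broker support (RabbitMQ priority queues;
# on Redis, Celery emulates it with multiple queues).
ingest.apply_async(args=["doc-id"], kwargs={"role_id": None}, priority=priority)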
Example #13
File: __init__.py, Project: kkrbalam/aleph
def ingest_document(document, file_path, role_id=None):
    """Given a stub document and file path, extract information.
    This does not attempt to infer metadata such as a file name."""
    document.status = Document.STATUS_PENDING
    if not is_file(file_path):
        manager = get_manager()
        manager.ingest_document(document, file_path=file_path, role_id=role_id)
    else:
        document.content_hash = archive.archive_file(file_path)
        db.session.commit()
        queue = WORKER_QUEUE
        routing_key = WORKER_ROUTING_KEY
        if role_id is not None:
            queue = USER_QUEUE
            routing_key = USER_ROUTING_KEY
        ingest.apply_async(args=[document.id],
                           kwargs={'role_id': role_id},
                           queue=queue,
                           routing_key=routing_key)
Example #14
File: documents.py, Project: stofstar/aleph
def crawl_directory(collection, path, parent=None):
    """Crawl the contents of the given path."""
    content_hash = None
    if not path.is_dir():
        content_hash = archive.archive_file(path)
    foreign_id = path.name
    if parent is not None:
        foreign_id = os.path.join(parent.foreign_id, foreign_id)
    meta = {'file_name': path.name}
    document = Document.save(collection,
                             parent=parent,
                             foreign_id=foreign_id,
                             content_hash=content_hash,
                             meta=meta)
    db.session.commit()
    ingest_entity(collection, document.to_proxy())
    log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)
    if path.is_dir():
        for child in path.iterdir():
            crawl_directory(collection, child, document)
Example #15
File: __init__.py, Project: maquchizi/aleph
def ingest_file(collection_id,
                meta,
                file_path,
                move=False,
                queue=WORKER_QUEUE,
                routing_key=WORKER_ROUTING_KEY):
    # the queue and routing key arguments are a workaround to
    # expedite user uploads over long-running batch imports.
    file_path = string_value(file_path)
    try:
        if not os.path.isfile(file_path):
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = archive.archive_file(file_path, meta, move=move)
        ingest.apply_async([collection_id, meta.to_attr_dict()],
                           queue=queue,
                           routing_key=routing_key)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
Example #16
def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    try:
        content_hash = None
        if not path.is_dir():
            content_hash = archive.archive_file(path)
        foreign_id = path.name
        if parent is not None:
            foreign_id = os.path.join(parent.foreign_id, foreign_id)

        # if the job_id is not set yet and path.is_dir(), we know it is the
        # first iteration and we don't create an initial root folder as parent
        # to be consistent with the behaviour of alephclient
        if path.is_dir() and job_id is None:
            document = None
            job_id = Job.random_id()
        else:
            meta = {"file_name": path.name}
            document = Document.save(
                collection,
                parent=parent,
                foreign_id=foreign_id,
                content_hash=content_hash,
                meta=meta,
            )
            db.session.commit()
            job_id = job_id or Job.random_id()
            proxy = document.to_proxy()
            ingest_flush(collection, entity_id=proxy.id)
            ingest_entity(collection, proxy, job_id=job_id)
            log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)

        if path.is_dir():
            for child in path.iterdir():
                crawl_directory(collection, child, document, job_id)
    except OSError:
        log.exception("Cannot crawl directory: %s", path)
Example #17
 def setUp(self):
     super(ArchiveApiTestCase, self).setUp()
     self.fixture = self.get_fixture_path('samples/website.html')
     self.content_hash = archive.archive_file(self.fixture)
Example #18
 def setUp(self):
     super(ArchiveApiTestCase, self).setUp()
     self.fixture = self.get_fixture_path("samples/website.html")
     self.content_hash = archive.archive_file(self.fixture)
     self.fixture2 = self.get_fixture_path("samples/taggable.txt")
     self.content_hash2 = archive.archive_file(self.fixture2)
Example #19
File: ingest_api.py, Project: x0rzkov/aleph
def ingest_upload(collection_id):
    """
    ---
    post:
      summary: Upload a document to a collection
      description: Upload a document to a collection with id `collection_id`
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      requestBody:
        content:
          multipart/form-data:
            schema:
              type: object
              properties:
                file:
                  type: string
                  format: binary
                  description: The document to upload
                meta:
                  $ref: '#/components/schemas/DocumentIngest'
      responses:
        '201':
          description: OK
          content:
            application/json:
              schema:
                properties:
                  id:
                    description: id of the uploaded document
                    type: integer
                  status:
                    type: string
                type: object
      tags:
      - Ingest
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy()
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, sync=sync)
        document_id = collection.ns.sign(document.id)
        _notify(collection, document_id)
    finally:
        shutil.rmtree(upload_dir)

    return jsonify({'status': 'ok', 'id': document_id}, status=201)
Example #20
File: result.py, Project: nt0z/aleph
 def emit_pdf_alternative(self, file_path):
     content_hash = archive.archive_file(file_path)
     self.document.pdf_version = content_hash
Example #21
 def store_pdf(self, meta, pdf_path):
     archive.archive_file(pdf_path, meta.pdf, move=False)
Example #22
File: text.py, Project: DavidLemayian/aleph
 def store_pdf(self, meta, pdf_path, move=True):
     archive.archive_file(pdf_path, meta.pdf, move=move)
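Examples #21 and #22 differ mainly in the move flag: example #21 stores the rendered PDF with move=False (the source file is copied and left in place), while example #22 defaults to move=True (the source file is consumed, as with the temp file in example #4). A standard-library sketch of the distinction, under the same content-addressing assumptions as the first sketch above:

import shutil
from pathlib import Path

def archive_sketch(file_path, dest_dir, move=False):
    dest = Path(dest_dir) / Path(file_path).name
    dest.parent.mkdir(parents=True, exist_ok=True)
    if move:
        shutil.move(str(file_path), str(dest))      # source is consumed
    else:
        shutil.copyfile(str(file_path), str(dest))  # source left intact
    return dest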