    def upload_to_path(self, request: Request, path: str,
                       content_length: int) -> Response:
        # TODO: This might not be secure (ok for now due to permissions check)
        upload_path = os.path.realpath(
            os.path.join(current_app.config["SERVICE_DATA"],
                         os.path.dirname(path),
                         secure_filename(os.path.basename(path))))
        if not os.path.realpath(
                os.path.join(
                    current_app.config["SERVICE_DATA"], path)).startswith(
                        os.path.realpath(current_app.config["SERVICE_DATA"])):
            # TODO: Mark against user
            return flask_bad_request_error(
                "Cannot upload outside of the drop box")

        if os.path.exists(upload_path):
            return flask_bad_request_error("Cannot upload to an existing path")

        try:
            os.makedirs(os.path.dirname(upload_path), exist_ok=True)
        except FileNotFoundError:  # blank dirname
            pass

        bytes_left = content_length
        with open(upload_path, "wb") as f:
            while bytes_left > 0:
                chunk = request.stream.read(4096)  # Read in 4 KiB chunks
                if not chunk:  # Stream ended before the declared Content-Length
                    break
                f.write(chunk)
                bytes_left -= len(chunk)

        return current_app.response_class(status=204)
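
# A hedged usage sketch (not part of the service code above): streaming a local
# file to the drop box upload handler with an explicit Content-Length, which
# upload_to_path() relies on to know how many bytes to read. The base URL and
# the "/objects/<path>" route are assumptions for illustration only.
import requests

def upload_example(base_url: str, local_file: str, remote_path: str) -> None:
    with open(local_file, "rb") as fh:
        r = requests.put(
            f"{base_url}/objects/{remote_path}",  # hypothetical route
            data=fh,  # requests derives Content-Length from the open file's size
            headers={"Content-Type": "application/octet-stream"},
        )
    r.raise_for_status()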
Example #2
def object_ingest():
    data = request.json or {}

    obj_path: str = data.get("path")

    if not obj_path or not isinstance(obj_path, str):
        return flask_errors.flask_bad_request_error(
            "Missing or invalid path parameter in JSON request")

    # TODO: Should this always be the case?
    deduplicate: bool = data.get("deduplicate", False)

    drs_object: Optional[DrsObject] = None
    if deduplicate:
        # Get checksum of original file, and query database for objects that match
        checksum = drs_file_checksum(obj_path)
        drs_object = DrsObject.query.filter_by(checksum=checksum).first()

    if not drs_object:
        try:
            drs_object = DrsObject(location=obj_path)

            db.session.add(drs_object)
            db.session.commit()
        except Exception as e:  # TODO: More specific handling
            db.session.rollback()
            current_app.logger.error(
                f"[{SERVICE_NAME}] Encountered exception during ingest: {e}")
            return flask_errors.flask_bad_request_error(
                "Error while creating the object")

    response = build_object_json(drs_object)

    return response, 201
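
# A hedged usage sketch: POSTing an ingest request as JSON. Only "path" is
# required by object_ingest() above; "deduplicate" is optional and defaults to
# False. The base URL and the "/ingest" route are assumptions for illustration.
import requests

def ingest_example(base_url: str, obj_path: str) -> dict:
    r = requests.post(
        f"{base_url}/ingest",  # hypothetical route
        json={"path": obj_path, "deduplicate": True},
    )
    r.raise_for_status()
    return r.json()  # the DRS object record built by build_object_json()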
    def retrieve_from_path(self, path: str) -> Response:
        directory_items = self.get_directory_tree()

        # Otherwise, find the file if it exists and return it.
        path_parts = path.split("/")  # TODO: Deal with slashes in file names

        while len(path_parts) > 0:
            part = path_parts[0]
            path_parts = path_parts[1:]

            if part not in {item["name"] for item in directory_items}:
                return flask_not_found_error("Nothing found at specified path")

            try:
                node = next(item for item in directory_items
                            if item["name"] == part)

                if "contents" not in node:
                    if len(path_parts) > 0:
                        return flask_bad_request_error(
                            "Cannot retrieve a directory")

                    return send_file(node["path"],
                                     mimetype="application/octet-stream",
                                     as_attachment=True,
                                     download_name=node["name"])

                directory_items = node["contents"]

            except StopIteration:
                return flask_not_found_error("Nothing found at specified path")

        # The loop consumed the whole path but ended on a directory node
        return flask_bad_request_error("Cannot retrieve a directory")
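
# A hedged sketch of the directory tree shape retrieve_from_path() appears to
# assume, based on the keys it reads: every entry has a "name"; files carry a
# "path" (handed to send_file), and directories carry a "contents" list of
# nested entries. The exact structure returned by get_directory_tree() is an
# assumption here.
_EXAMPLE_TREE = [
    {
        "name": "genomics",
        "contents": [
            {"name": "sample1.vcf.gz", "path": "/data/drop-box/genomics/sample1.vcf.gz"},
        ],
    },
    {"name": "README.txt", "path": "/data/drop-box/README.txt"},
]
# With this tree, "genomics/sample1.vcf.gz" would be sent as a file attachment,
# while "genomics" alone would be rejected as a directory.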
Example #4
def run_cancel(run_id):
    # TODO: Check if already completed
    # TODO: Check if run log exists
    # TODO: from celery.task.control import revoke; revoke(celery_id, terminate=True)
    db = get_db()
    c = db.cursor()
    event_bus = get_flask_event_bus()

    c.execute("SELECT * FROM runs WHERE id = ?", (str(run_id),))
    run = c.fetchone()

    if run is None:
        return flask_not_found_error(f"Run {run_id} not found")

    if run["state"] in (states.STATE_CANCELING, states.STATE_CANCELED):
        return flask_bad_request_error("Run already canceled")

    if run["state"] in states.FAILURE_STATES:
        return flask_bad_request_error("Run already terminated with error")

    if run["state"] in states.SUCCESS_STATES:
        return flask_bad_request_error("Run already completed")

    c.execute("SELECT * FROM run_logs WHERE id = ?", (run["run_log"],))
    run_log = c.fetchone()

    if run_log is None:
        return flask_internal_server_error(f"No run log present for run {run_id}")

    if run_log["celery_id"] is None:
        # Run never made it into the Celery queue, so there is no task to revoke
        return flask_internal_server_error(f"No Celery ID present for run {run_id}")

    # TODO: terminate=True might be iffy
    update_run_state_and_commit(db, c, event_bus, run["id"], states.STATE_CANCELING)
    celery.control.revoke(run_log["celery_id"], terminate=True)  # Remove from queue if there, terminate if running

    # TODO: wait for revocation / failure and update status...

    # TODO: Generalize clean-up code / fetch from back-end
    run_dir = os.path.join(current_app.config["SERVICE_TEMP"], run["id"])
    shutil.rmtree(run_dir, ignore_errors=True)

    update_run_state_and_commit(db, c, event_bus, run["id"], states.STATE_CANCELED)

    return current_app.response_class(status=204)  # TODO: Better response
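
# A hedged sketch of the run-state constants run_cancel() assumes, modelled on
# the GA4GH WES run states; the real `states` module in this code base may name
# or group them differently.
STATE_QUEUED = "QUEUED"
STATE_CANCELING = "CANCELING"
STATE_CANCELED = "CANCELED"
FAILURE_STATES = frozenset({"EXECUTOR_ERROR", "SYSTEM_ERROR"})
SUCCESS_STATES = frozenset({"COMPLETE"})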
Example #5
def object_search():
    response = []
    name = request.args.get("name")
    fuzzy_name = request.args.get("fuzzy_name")

    if name:
        objects = DrsObject.query.filter_by(name=name).all()
    elif fuzzy_name:
        objects = DrsObject.query.filter(
            DrsObject.name.contains(fuzzy_name)).all()
    else:
        return flask_errors.flask_bad_request_error(
            "Missing GET search terms (either name or fuzzy_name)")

    for obj in objects:
        response.append(build_object_json(obj))

    return jsonify(response)
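
# A hedged usage sketch: searching by exact name or by substring ("fuzzy")
# name via GET query parameters, matching the two branches of object_search()
# above. The base URL and "/search" route are assumptions for illustration.
import requests

def search_example(base_url: str) -> None:
    exact = requests.get(f"{base_url}/search", params={"name": "sample1.vcf.gz"})
    fuzzy = requests.get(f"{base_url}/search", params={"fuzzy_name": "sample"})
    print(exact.json())  # objects whose name matches exactly
    print(fuzzy.json())  # objects whose name contains the substring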
Example #6
def drop_box_retrieve(path):
    # Werkzeug decodes URL paths as Latin-1 by default; re-decode as UTF-8
    # in case the filename contains Unicode characters
    try:
        path = path.encode('iso-8859-1').decode('utf8')
    except UnicodeDecodeError:
        pass

    backend = get_backend()

    if backend is None:
        return flask_internal_server_error(
            "The service source data is not configured properly")

    if request.method == "PUT":
        content_length = int(request.headers.get("Content-Length", "0"))
        if content_length == 0:
            return flask_bad_request_error(
                "No file provided or no/zero content length specified")
        return backend.upload_to_path(request, path, content_length)

    return backend.retrieve_from_path(path)
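
# A hedged usage sketch: retrieving a file whose name contains non-ASCII
# characters. drop_box_retrieve() re-decodes Werkzeug's Latin-1 path as UTF-8,
# so an ordinary percent-encoded UTF-8 URL works. The base URL and route are
# assumptions for illustration only.
import requests
from urllib.parse import quote

def retrieve_example(base_url: str, remote_path: str) -> bytes:
    r = requests.get(f"{base_url}/objects/{quote(remote_path)}")  # hypothetical route
    r.raise_for_status()
    return r.content  # raw file bytes sent as an attachment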
Example #7
def _create_run(db, c):
    try:
        assert "workflow_params" in request.form
        assert "workflow_type" in request.form
        assert "workflow_type_version" in request.form
        assert "workflow_engine_parameters" in request.form
        assert "workflow_url" in request.form
        assert "tags" in request.form

        workflow_params = json.loads(request.form["workflow_params"])
        workflow_type = request.form["workflow_type"].upper().strip()
        workflow_type_version = request.form["workflow_type_version"].strip()
        workflow_engine_parameters = json.loads(request.form["workflow_engine_parameters"])  # TODO: Unused
        workflow_url = request.form["workflow_url"].lower()  # TODO: This can refer to an attachment
        workflow_attachment_list = request.files.getlist("workflow_attachment")  # TODO: Use this fully
        tags = json.loads(request.form["tags"])

        # TODO: Move CHORD-specific stuff out somehow?

        # Only "turn on" CHORD-specific features if specific tags are present

        chord_mode = all((
            "workflow_id" in tags,
            "workflow_metadata" in tags,

            # Allow either a path (for the 'classic' Bento Singularity
            # architecture) or a URL to be specified for ingestion
            "ingestion_path" in tags or "ingestion_url" in tags,

            "table_id" in tags,
        ))

        workflow_id = tags.get("workflow_id", workflow_url)
        workflow_metadata = tags.get("workflow_metadata", {})
        workflow_ingestion_path = tags.get("ingestion_path", None)
        workflow_ingestion_url = tags.get(
            "ingestion_url",
            (f"http+unix://{current_app.config['NGINX_INTERNAL_SOCKET']}{workflow_ingestion_path}"
             if workflow_ingestion_path else None))
        table_id = tags.get("table_id", None)

        # Don't accept anything other than WDL (e.g. CWL)  TODO: CWL support
        assert workflow_type == "WDL"
        assert workflow_type_version == "1.0"

        assert isinstance(workflow_params, dict)
        assert isinstance(workflow_engine_parameters, dict)
        assert isinstance(tags, dict)

        if chord_mode:
            table_id = str(uuid.UUID(table_id))  # Check and standardize table ID

        # TODO: Use JSON schemas for workflow params / engine parameters / tags

        # Get list of allowed workflow hosts from configuration for any checks inside the runner
        # If it's blank, assume that means "any host is allowed" and pass None to the runner
        workflow_host_allow_list = parse_workflow_host_allow_list(current_app.config["WORKFLOW_HOST_ALLOW_LIST"])

        # Download workflow file (potentially using passed auth headers, if
        # present and we're querying ourselves)

        # TODO: Move this back to runner, since we'll need to handle the callback anyway with local URLs...

        chord_url = current_app.config["CHORD_URL"]

        wm = WorkflowManager(
            current_app.config["SERVICE_TEMP"],
            chord_url,
            logger=current_app.logger,
            workflow_host_allow_list=workflow_host_allow_list,
            debug=current_app.config["BENTO_DEBUG"],
        )

        # Optional Authorization HTTP header to forward to nested requests
        # TODO: Move X-Auth... constant to bento_lib
        auth_header = request.headers.get("X-Authorization", request.headers.get("Authorization"))
        auth_header_dict = {"Authorization": auth_header} if auth_header else {}

        try:
            wm.download_or_copy_workflow(
                workflow_url,
                WorkflowType(workflow_type),
                auth_headers=auth_header_dict)
        except UnsupportedWorkflowType:
            return flask_bad_request_error(f"Unsupported workflow type: {workflow_type}")
        except (WorkflowDownloadError, requests.exceptions.ConnectionError) as e:
            return flask_bad_request_error(f"Could not access workflow file: {workflow_url} (Python error: {e})")

        # Generate one-time tokens for ingestion purposes if in Bento mode

        one_time_tokens = []
        drs_url: str = current_app.config["DRS_URL"]
        use_otts_for_drs: bool = chord_url in drs_url and urlparse(drs_url).scheme != "http+unix"
        ott_endpoint_namespace: str = current_app.config["OTT_ENDPOINT_NAMESPACE"]  # TODO: py3.9: walrus operator
        if chord_mode and ott_endpoint_namespace:
            # Generate the correct number of one-time tokens for the DRS and ingest scopes
            # to allow for the callback to ingest files
            # Skip doing this for DRS if the DRS URL is an internal UNIX socket / internal Docker URL
            # TODO: Remove this ^ bit and pull the plug on socket requests
            # TODO: Refactor into class

            headers = {**auth_header_dict}  # TODO: Host?
            ott_generate_url = urljoin(ott_endpoint_namespace.rstrip("/") + "/", "generate")

            if use_otts_for_drs:
                scope = f"/{drs_url.replace(chord_url, '').rstrip('/')}/"
                tr = requests.post(ott_generate_url, headers=headers, json={
                    # TODO: This sort of assumes DRS is on the same domain as WES, which isn't necessarily correct
                    #  An error should be thrown if there's a mismatch and we're still trying to do OTT stuff, probably
                    "scope": scope,
                    "number": count_bento_workflow_file_outputs(workflow_id, workflow_params, workflow_metadata),
                }, verify=not current_app.config["BENTO_DEBUG"])

                if not tr.ok:
                    # An error occurred while requesting OTTs, so we cannot complete the run request
                    return flask_internal_server_error(
                        f"Got error while requesting one-time-use tokens for DRS: {tr.content} "
                        f"(Scope: {scope}, OTT URL: {ott_generate_url}, headers included: {list(headers.keys())})")

                one_time_tokens.extend(tr.json())

            # Request an additional OTT for the service ingest request
            scope = ("/" if chord_url in workflow_ingestion_url else "") + workflow_ingestion_url.replace(
                chord_url, "").rsplit("/", 1)[0] + "/"
            tr = requests.post(ott_generate_url, headers=headers, json={
                # TODO: This sort of assumes the ingest URL is on the same domain as WES, which isn't necessarily
                #  correct. An error should be thrown if there's a mismatch and we're still trying to do OTT stuff
                "scope": scope,
                "number": 1,
            }, verify=not current_app.config["BENTO_DEBUG"])

            if not tr.ok:
                # An error occurred while requesting OTTs, so we cannot complete the run request
                return flask_internal_server_error(
                    f"Got error while requesting one-time-use tokens for ingestion URL: {tr.content} "
                    f"(Scope: {scope}, OTT URL: {ott_generate_url}, headers included: {list(headers.keys())})")

            one_time_tokens.extend(tr.json())

        # Begin creating the job after validating the request

        req_id = uuid.uuid4()
        run_id = uuid.uuid4()
        log_id = uuid.uuid4()

        # Create run directory

        run_dir = os.path.join(current_app.config["SERVICE_TEMP"], str(run_id))

        if os.path.exists(run_dir):
            return flask_internal_server_error("UUID collision")

        os.makedirs(run_dir, exist_ok=True)
        # TODO: Delete run dir if something goes wrong...

        # Move workflow attachments to run directory

        for attachment in workflow_attachment_list:
            # TODO: Check and fix input if filename is non-secure
            # TODO: Do we put these in a subdirectory?
            # TODO: Support WDL uploads for workflows
            attachment.save(os.path.join(run_dir, secure_filename(attachment.filename)))

        # Will be updated to STATE_QUEUED once submitted
        c.execute("INSERT INTO run_requests (id, workflow_params, workflow_type, workflow_type_version, "
                  "workflow_engine_parameters, workflow_url, tags) VALUES (?, ?, ?, ?, ?, ?, ?)",
                  (str(req_id), json.dumps(workflow_params), workflow_type, workflow_type_version,
                   json.dumps(workflow_engine_parameters), workflow_url, json.dumps(tags)))
        c.execute("INSERT INTO run_logs (id, name) VALUES (?, ?)", (str(log_id), workflow_id))
        c.execute("INSERT INTO runs (id, request, state, run_log, outputs) VALUES (?, ?, ?, ?, ?)",
                  (str(run_id), str(req_id), states.STATE_UNKNOWN, str(log_id), json.dumps({})))
        db.commit()

        # TODO: figure out timeout
        # TODO: retry policy
        c.execute("UPDATE runs SET state = ? WHERE id = ?", (states.STATE_QUEUED, str(run_id)))
        db.commit()

        run_workflow.delay(run_id, chord_mode, workflow_metadata, workflow_ingestion_url, table_id, one_time_tokens,
                           use_otts_for_drs)

        return jsonify({"run_id": str(run_id)})

    except ValueError as e:
        return flask_bad_request_error(f"Value error in run request: {e}")

    except AssertionError:  # TODO: Better error messages
        logger.error(f"Encountered assertion error: {traceback.format_exc()}")
        return flask_bad_request_error("Assertion error: bad run request format")