def add_inputs(run_id: str, inputs: List[dict], token) -> str:
    """
    Updates the analysis record with the inputs.

    Arguments:
        run_id {str} -- Record ID of the run.
        inputs {List[dict]} -- List of files used.
        token {str} -- JWT

    Returns:
        str -- Updated _etag.
    """
    try:
        results = EVE.get(endpoint="analysis", item_id=run_id,
                          token=token).json()
        etag = results["_etag"]
        return EVE.patch(
            endpoint="analysis",
            item_id=run_id,
            _etag=etag,
            token=token,
            json={
                "files_used": inputs
            },
        ).json()["_etag"]
    except RuntimeError as rte:
        log_formatted(
            logging.error,
            "Failed to upate analysis with inputs: %s" % str(rte),
            "ERROR-CELERY-SNAKEMAKE",
        )
Beispiel #2
0
def lock_trial(trial_id: str, admin: str) -> bool:
    """
    Locks a trial and associated assets so they cannot be used.

    Arguments:
        trial_id {str} -- ID of the trial to be locked.
        admin {str} -- Email of the admin requesting the lock.

    Returns:
        bool -- [description]
    """
    token = lock_trial.token["access_token"]
    # Patch the trial to be locked.
    try:
        etag = EVE_FETCHER.get(endpoint="trials", item_id=trial_id, token=token)
        EVE_FETCHER.patch(
            endpoint="trials",
            item_id=trial_id,
            _etag=etag,
            token=token,
            json={"locked": True},
        )
        log_formatted(
            logging.info,
            "Administrator %s locked trial %s" % (admin, trial_id),
            "INFO-CELERY-TRIALLOCK",
        )
    except RuntimeError as rte:
        log_formatted(
            logging.error,
            "Failed to lock trial:  %s , Error: %s" % (trial_id, str(rte)),
            "ERROR-CELERY-API",
        )
def find_valid_runs(record_response: dict,
                    assay_dict: dict) -> List[Tuple[dict, dict]]:
    """
    Checks to see if any runs are possible.

    Arguments:
        record_response {dict} -- Return from the aggregation query of the /data endpoint.
        assay_dict {dict} -- Dictionary keyed on assay_id containing assay info.

    Returns:
        List[dict] -- Tuple of a file group, and the assay which it should run.
    """
    valid_runs = []
    for grouping in record_response:
        assay_id = grouping["_id"]["assay"]
        try:
            mappings_present = [
                record["mapping"] for record in grouping["records"]
            ]
            if set(mappings_present) == set(
                    assay_dict[assay_id]["non_static_inputs"]):
                # If mappings are 1:1, a run may start.
                valid_runs.append((grouping, assay_dict[assay_id]))
        except KeyError:
            log_formatted(
                logging.error,
                "A record with an invalid assay id was found",
                "ERROR-CELERY-SNAKEMAKE",
            )
            return None
    return valid_runs
def update_child_list(record_response: dict, endpoint: str, parent_id: str) -> dict:
    """
    Adds item to a record's child list.

    Arguments:
        record_response {dict} -- Response to POST of new child.
        endpoint {str} -- Resource endpoint of record.
        parent_id {str} -- id of parent record.

    Raises:
        RuntimeError -- [description]

    Returns:
        dict -- HTTP response to patch.
    """
    records = None

    if "_items" in record_response:
        records = record_response["_items"]
    else:
        records = [record_response]

    new_children = []
    for record in records:
        new_children.append({"_id": record["_id"], "resource": endpoint})

    try:
        # Get etag of parent.
        parent = EVE_FETCHER.get(
            endpoint="data?where=%s" % json.dumps({"_id": parent_id}),
            token=process_file.token["access_token"],
        ).json()

        # Update parent record.
        return EVE_FETCHER.patch(
            endpoint="data_edit",
            item_id=str(parent["_items"][0]["_id"]),
            json={"children": parent["_items"][0]["children"] + new_children},
            _etag=parent["_items"][0]["_etag"],
            token=process_file.token["access_token"],
        )
    except RuntimeError as code_error:
        if not parent:
            log_formatted(
                logging.error,
                "Error fetching parent %s" % str(code_error),
                "ERROR-CELERY-GET",
            )
        else:
            log_formatted(
                logging.error,
                "Error updating child list of parent: %s" % str(code_error),
                "ERROR-CELERY-PATCH-FAIR",
            )
        return None
def check_for_runs(token: str) -> Tuple[List[dict], List[dict]]:
    """
    Checks to see if any runs can start

    Arguments:
        token {str} -- JWT for API.

    Returns:
        Tuple[List[dict], List[dict]] -- Tuple, where each item is a data/assay pairing
        representing a run that can be started.
    """

    try:
        # Only return assays that have a workflow associated with them
        assay_query = {"workflow_location": {"$ne": "null"}}
        assay_query_string = "assays?where=%s" % json.dumps(assay_query)

        # Contains a list of all the running assays and their inputs
        assay_response = EVE.get(token=token,
                                 endpoint=assay_query_string,
                                 code=200).json()["_items"]
        sought_mappings = [
            item
            for sublist in [x["non_static_inputs"] for x in assay_response]
            for item in sublist
        ]
        query_string = "data/query?aggregate=%s" % (json.dumps(
            {"$inputs": sought_mappings}))
        record_response = EVE.get(token=token, endpoint=query_string, code=200)
        # Create an assay id keyed dictionary to simplify searching.
        assay_dict = {
            assay["_id"]: {
                "non_static_inputs": assay["non_static_inputs"],
                "assay_name": assay["assay_name"],
                "workflow_location": assay["workflow_location"],
            }
            for assay in assay_response
        }
        return record_response, assay_dict
    except RuntimeError as rte:
        try:
            str(query_string)
            error_msg = "Failed to fetch record from data collection: %s" % str(
                rte)
        except NameError:
            error_msg = "Failed to fetch record from assays collection: %s" % str(
                rte)
        log_formatted(logging.error, error_msg, "ERROR-CELERY-QUERY")
        return None
def delete_from_bucket(gs_uri: str) -> None:
    """
    Deletes a google clouds torage object with the given path.

    Arguments:
        gs_uri {str} -- Path of file to be deleted.

    Returns:
        None -- [description]
    """
    gsutil_args = ["gsutil", "rm", gs_uri]
    run_subprocess_with_logs(gsutil_args, message="Deleting Object: %s")
    log_formatted(
        logging.info, "Finished deleting object: %s" % gs_uri, "FAIR-CELERY-DELETE"
    )
Beispiel #7
0
def serialize_objectids(items: List[dict]) -> None:
    """
    Transforms the ID strings into ObjectID objects for propper mapping.

    Arguments:
        items {List[dict]} -- List of data records.
    """
    for record in items:
        record["assay"] = ObjectId(record["assay"])
        record["trial"] = ObjectId(record["trial"])
        record["processed"] = False
        record["visibility"] = True
        record["children"] = []
        log = "Record: %s, for Trial: %s, in Assay: %s uploaded" % (
            record["file_name"],
            str(record["trial"]),
            str(record["assay"]),
        )
        log_formatted(logging.info, log, "INFO-EVE-DATA")
def postprocessing(records: List[dict]) -> None:
    """
    Scans incoming records and sees if they need post-processing.
    If so, they are processed and then uploaded to mongo.

    Arguments:
        records {List[dict]} -- List of records slated for processing.
    """
    tasks = []

    for rec in records:
        message = "Processing: " + rec["file_name"]
        logging.info({"message": message, "category": "FAIR-CELERY-PROCESSING"})
        for pro in PROC:
            if re.search(PROC[pro]["re"], rec["file_name"], re.IGNORECASE):
                log_formatted(
                    logging.info,
                    "Match found for " + rec["file_name"],
                    "INFO-CELERY-PROCESSING",
                )
                # If a match is found, add to job queue
                tasks.append(process_file.s(rec, pro))

    execute_in_parallel(tasks)
def tail_logs(logs: List[str]) -> List[str]:
    """
    Takes a list of log uris and returns their tails concat into one.

    Arguments:
        logs {List[str]} -- [description]
    """
    log_list: List[str] = []
    for i, uri in enumerate(logs):
        try:
            filename: str = "tmp%s" % str(i)
            gsutil_args = ["gsutil", "cp", "gs://" + uri, filename]
            run_subprocess_with_logs(gsutil_args,
                                     message="Copying logs for tailing")
            last_lines = str.join("", tail(filename, lines=50))
            os.remove(filename)
            log_list.append(last_lines)
        except FileNotFoundError:
            log_formatted(
                logging.warning,
                "No log found for gs_uri: %s. This may be the result of a failed pipeline.",
                "WARNING-CELERY-SNAKEMAKE",
            )
    return log_list
def process_file(rec: dict, pro: str) -> bool:
    """
    Worker process that handles processing an individual file.

    Arguments:
        rec {dict} -- A record item to be processed.
        pro {str} -- Key in the processing dictionary that the filetype
        corresponds to.

    Returns:
        boolean -- True if completed without error, else false.
    """
    temp_file_name = str(uuid4())
    gs_args = ["gsutil", "cp", rec["gs_uri"], temp_file_name]
    subprocess.run(gs_args)
    records = None
    response = None

    try:
        # Use a random filename as tasks are executing in parallel.
        if PROC[pro]["mongo"]:
            records = PROC[pro]["func"](
                temp_file_name,
                RecordContext(
                    rec["trial"]["$oid"], rec["assay"]["$oid"], rec["_id"]["$oid"]
                ),
            )
        else:
            PROC[pro]["func"](
                temp_file_name,
                RecordContext(
                    rec["trial"]["$oid"], rec["assay"]["$oid"], rec["_id"]["$oid"], rec
                ),
            )
        remove(temp_file_name)
    except OSError:
        pass

    if not PROC[pro]["mongo"]:
        return True

    if not records:
        return False

    try:
        # First try a normal upload
        response = EVE_FETCHER.post(
            endpoint=pro,
            token=process_file.token["access_token"],
            code=201,
            json=records,
        )

        update_child_list(response.json(), pro, rec["_id"]["$oid"])

        # Simplify handling later
        if not isinstance(records, (list,)):
            records = [records]

        # Record uploads.
        log_record_upload(records, pro)
        return True
    except RuntimeError as rte:
        # Catch unexpected error codes.
        if "422" not in str(rte):
            log_formatted(
                logging.error,
                "Upload failed for reasons other than validation: %s" % str(rte),
                "ERROR-CELERY-UPLOAD",
            )
            return False

        # Extract Cerberus errors from response.
        new_upload = report_validation_issues(response, records)
        if not new_upload:
            # Handle data types that haven't used the new schema yet.
            logging.warning(
                {
                    "message": (
                        "Validation error detected in upload, but schema not adapted to"
                        "handle error reports"
                    ),
                    "category": "WARNING-CELERY-UPLOAD",
                }
            )
            return False
        try:
            # Try to upload just the errors.
            errors = EVE_FETCHER.post(
                endpoint=pro,
                token=process_file.token["access_token"],
                code=201,
                json=new_upload,
            )

            # Still update parent to tie to error documents.
            update_child_list(errors.json(), pro, rec["_id"]["$oid"])
            log_record_upload(new_upload, pro)
        except RuntimeError as rte:
            # If that fails, something on my end is wrong.
            log = "Upload of validation errors failed. %s" % str(rte)
            logging.error({"message": log, "category": "ERROR-CELERY-UPLOAD"})
    return False
def process_maf(path: str, context: RecordContext) -> bool:
    """
    Processes a new maf file to update the combined maf.

    Arguments:
        path {str} -- On disk path of the file.
        context {RecordContext} -- Class holding trial, assay, and a copy of the record.

    Returns:
        bool -- True if finished without error, else false.
    """
    if context.full_record["file_name"] == "combined.maf":
        return True

    token = process_file.token["access_token"]
    # Check if a combined maf already exists.
    query_string = "data?where=%s" % json.dumps(
        {"trial": context.trial, "assay": context.assay, "file_name": "combined.maf"}
    )

    extant_combined_maf = EVE_FETCHER.get(endpoint=query_string, token=token).json()[
        "_items"
    ]

    if not extant_combined_maf:
        create_new_maf(path, context, token)
        return True

    combined_maffile = extant_combined_maf[0]
    maf_gs_uri = combined_maffile["gs_uri"]
    combined_file_name = str(uuid4())
    run_subprocess_with_logs(
        ["gsutil", "mv", maf_gs_uri, combined_file_name],
        message="Copying combined maf file %s" % maf_gs_uri,
    )

    # Combine mafs.
    combine_mafs(path, combined_file_name)

    # Overwrite old file.
    run_subprocess_with_logs(
        ["gsutil", "mv", combined_file_name, maf_gs_uri],
        message="Overwriting old combined.maf: %s" % maf_gs_uri,
    )
    log_formatted(
        logging.info,
        "Overwriting old combined.maf: %s" % maf_gs_uri,
        "FAIR-CELERY-COMBINEDMAF",
    )
    new_sample_ids = combined_maffile["sample_ids"] + context.full_record["sample_ids"]

    # Patch data to include new samples.
    try:
        EVE_FETCHER.patch(
            endpoint="data_edit",
            item_id=combined_maffile["_id"],
            _etag=combined_maffile["_etag"],
            json={
                "sample_ids": new_sample_ids,
                "number_of_samples": len(new_sample_ids),
            },
            token=token,
        )
        return True
    except RuntimeError as rte:
        if "412" in str(rte):
            time.sleep(5)
            process_maf(path, context)
            return True

        log_formatted(
            logging.error,
            "Failed to edit combined.maf: %s" % str(rte),
            "ERROR-CELERY-PATCH",
        )
        return False
def create_new_maf(path: str, context: RecordContext, token: str) -> bool:
    """
    Create a new combined MAF.

    Arguments:
        path {str} -- path to file
        context {RecordContext} -- Object holding the record.

    Returns:
        bool -- True if succeeds, else false.
    """
    new_record = context.full_record

    # Check for other analyses that finished recently.
    an_query = "analysis?where=%s" % json.dumps(
        {"trial": context.trial, "assay": context.assay, "status": "Completed"}
    )
    analysis_runs = EVE_FETCHER.get(endpoint=an_query, token=token).json()["_items"]
    isotime = datetime.datetime.now(datetime.timezone.utc)

    my_end = 0
    my_run_id = None
    for run in analysis_runs:
        for output in run["files_generated"]:
            if output["file_name"] == new_record["file_name"]:
                my_end = parser.parse(run["end_date"])
                my_run_id = run["_id"]

    # If the time is close enough that the trial might not have cleaned up yet,
    # wait, then retry.
    for run in analysis_runs:
        end = parser.parse(run["end_date"])
        delta = (isotime - end).total_seconds()
        if run["_id"] != my_run_id and delta < 30:
            log_formatted(
                logging.info,
                "Another run ended recently, running checks to avoid race condition.",
                "INFO-CELERY-MAF",
            )
            if (
                my_end > end
                or my_end == end
                and min(my_run_id, run["_id"]) == my_run_id
            ):
                log_formatted(
                    logging.info,
                    "Detecting an older finished run. Sleeping",
                    "INFO-CELERY-SLEEP",
                )
                time.sleep(5)
                process_maf(path, context)
                return True

    # Rename and create a new data entry.
    reformat_maf(new_record, context)

    # Generate new alias for combined maf.
    new_record["gs_uri"] = (
        new_record["gs_uri"].replace(new_record["file_name"], "") + "/combined.maf"
    )
    run_subprocess_with_logs(
        ["gsutil", "mv", path, new_record["gs_uri"]],
        message="Copying new combined.maf to google bucket: %s" % new_record["gs_uri"],
    )

    try:
        remove(path)
    except FileNotFoundError:
        pass

    log_formatted(
        logging.info,
        "Copying for new combined.maf: %s complete." % new_record["gs_uri"],
        "FAIR-CELERY-NEWCOMBINEDMAF",
    )

    try:
        EVE_FETCHER.post(endpoint="data_edit", token=token, json=new_record, code=201)
        return True
    except RuntimeError as rte:
        log_formatted(
            logging.error,
            "Failed to create new combined.maf: %s" % str(rte),
            "ERROR-CELERY-POST",
        )
        return False
def execute_workflow(valid_run: Tuple[dict, dict]):
    """
    Create inputs, run snakemake, report results.

    Arguments:
        valid_run {Tuple[dict, dict]} -- Tuple of (aggregation result, assay info)
    """
    # Check that all records are unprocessed.
    token = execute_workflow.token["access_token"]
    records, all_free = check_processed(valid_run[0]["records"], token)
    analysis_response = register_analysis(valid_run, token)
    if not all_free:
        logging.error({
            "message": "Some input files are processed, check celery logic!",
            "category": "ERROR-CELERY-SNAKEMAKE",
        })
        return False
    logging.info({
        "message": "Setting files to processed",
        "category": "INFO-CELERY-SNAKEMAKE"
    })
    set_record_processed(records, True, token)

    # reference shortcuts
    aggregation_res = valid_run[0]["_id"]
    cimac_sample_id: str = aggregation_res["sample_ids"][0]
    run_id: str = str(analysis_response["_id"])

    snakemake_file = clone_snakemake(valid_run[1]["workflow_location"], run_id)
    analysis_response["files_used"] = create_input_json(
        valid_run[0]["records"], run_id, cimac_sample_id)
    analysis_response["_etag"] = add_inputs(run_id,
                                            analysis_response["files_used"],
                                            token)

    # Run Snakefile.
    try:
        workflow_dag, problem = run_snakefile(snakemake_file, workdir=run_id)
        if problem:
            logging.error({
                "message": "Snakemake run failed!",
                "category": "ERROR-CELERY-SNAKEMAKE",
            })
            records, all_free = check_processed(valid_run[0]["records"], token)
            set_record_processed(records, False, token)
            error_str = str(problem)
            logging.error({
                "message": error_str,
                "category": "ERROR-CELERY-SNAKEMAKE"
            })
            send_mail(
                "Snakemake Pipeline Failed",
                "The snakemake pipeline failed with message: %s" % error_str,
                ["*****@*****.**"],
                "*****@*****.**",
                SENDGRID_API_KEY,
            )
            return False
        return True
    except Exception as excp:
        log_formatted(
            logging.error,
            "Uncaught Snakemake Exception: %s. Run has failed." % str(excp),
            "ERROR-CELERY-SNAKEMAKE",
        )
        problem = str(excp)
    finally:
        update_analysis(valid_run,
                        token,
                        analysis_response,
                        workflow_dag,
                        problem=problem)

    return False
def update_analysis(valid_run: Tuple[dict, dict],
                    token: str,
                    analysis: dict,
                    dag,
                    problem=None) -> bool:
    """
    Update analysis record with results when the analysis has completed.

    Arguments:
        valid_run {Tuple[dict, dict]} -- Information about the run.
        token {str} -- JWT
        analysis {dict} -- Analysis id, _etag, input files.
        dag {snakemake.dag.DAG} -- DAG objet from the finished run.
        problem {str} -- Error message, string "Error" or None.

    Returns:
        bool -- True if update completes without error, else false.
    """
    payload = valid_run[0]["_id"]
    payload["workflow_location"] = valid_run[1]["workflow_location"]

    # Handle the case of failure to generate a DAG
    if dag:
        jobs, outputs = analyze_jobs(dag)
        payload["jobs"] = jobs
    else:
        payload["jobs"] = []

    snake_log_info = upload_snakelogs(analysis["_id"])
    payload["snakemake_logs"] = snake_log_info["log_locations"]
    payload["snakemake_log_tails"] = snake_log_info["log_tails"]
    payload["files_used"] = analysis["files_used"]
    payload["end_date"] = datetime.datetime.now(
        datetime.timezone.utc).isoformat()

    # If job succeeds, upload files.
    if outputs and not problem:
        payload["files_generated"] = upload_results(valid_run, outputs, token)
        if not payload["files_generated"]:
            payload["status"] = "Failed"
            payload[
                "error_message"] = "The pipeline failed to upload the results to the database."
            payload.__delitem__("files_generated")

    if problem:
        payload["status"] = "Failed"
        if problem == "Error":
            payload["error_message"] = (
                "One of the jobs experienced an error," +
                "please consult the logs for more information")
        else:
            payload["error_message"] = "Snakefile Error: %s" % str(problem)
    else:
        payload["status"] = "Completed"

    # some upload function.
    try:
        EVE.patch(
            endpoint="analysis",
            token=token,
            _etag=analysis["_etag"],
            item_id=analysis["_id"],
            json=payload,
            code=200,
        )
    except RuntimeError as rte:
        log_formatted(
            logging.error,
            "Analysis update failed: %s.\nPayload:%s" % (str(rte), payload),
            "ERROR-CELERY-PATCH",
        )
        return False
    return True
def upload_results(valid_run: dict, outputs: List[str],
                   token: str) -> List[dict]:
    """
    Upload the output files from a run.

    Arguments:
        valid_run {dict} -- Run information.
        outputs {List[str]} -- List of GS-uris of output files.
        token {str} -- JWT

    Returns:
        List[dict] -- List of inserted records.
    """
    payload = []
    bucket = storage.Client().get_bucket(GOOGLE_BUCKET_NAME)
    aggregation_res = valid_run[0]["_id"]
    for output in outputs:
        prefix = output.replace(GOOGLE_BUCKET_NAME + "/", "")
        logging.info({"message": prefix, "category": "PREFIX-EBUG"})
        output_file = [item for item in bucket.list_blobs(prefix=prefix)][0]
        payload.append({
            "data_format":
            get_data_format(output),
            "date_created":
            datetime.datetime.now(datetime.timezone.utc).isoformat(),
            "file_name":
            output_file.name.split("/")[-1],
            "uuid_alias":
            "unaliased",
            "file_size":
            output_file.size,
            "sample_ids":
            aggregation_res["sample_ids"],
            "number_of_samples":
            len(aggregation_res["sample_ids"]),
            "trial":
            aggregation_res["trial"],
            "trial_name":
            aggregation_res["trial_name"],
            "assay":
            aggregation_res["assay"],
            "gs_uri":
            "gs://%s/%s" % (GOOGLE_BUCKET_NAME, output_file.name),
            "mapping":
            "",
            "experimental_strategy":
            aggregation_res["experimental_strategy"],
            "processed":
            False,
            "visibility":
            True,
            "children": [],
        })
        authorized_users = get_authorized_users(
            {
                "assay": aggregation_res["assay"],
                "trial": aggregation_res["trial"]
            },
            token,
        )
        manage_bucket_acl(GOOGLE_BUCKET_NAME, payload[-1]["gs_uri"],
                          authorized_users)
    try:
        inserts = EVE.post(endpoint="data_edit",
                           code=201,
                           token=token,
                           json=payload).json()["_items"]
        logging.info({
            "message": "Records created for run outputs",
            "category": "FAIR-CELERY-SNAKEMAKE",
        })
        return [{
            "data_id": x["_id"],
            "gs_uri": y["gs_uri"],
            "file_name": y["file_name"],
            "data_format": y["data_format"],
        } for x, y in zip(inserts, payload)]
    except RuntimeError as rte:
        log_formatted(
            logging.error,
            "Upload of run outputs failed: %s" % str(rte),
            "ERROR-CELERY-UPLOAD",
        )
        return None