def add_inputs(run_id: str, inputs: List[dict], token) -> str: """ Updates the analysis record with the inputs. Arguments: run_id {str} -- Record ID of the run. inputs {List[dict]} -- List of files used. token {str} -- JWT Returns: str -- Updated _etag. """ try: results = EVE.get(endpoint="analysis", item_id=run_id, token=token).json() etag = results["_etag"] return EVE.patch( endpoint="analysis", item_id=run_id, _etag=etag, token=token, json={ "files_used": inputs }, ).json()["_etag"] except RuntimeError as rte: log_formatted( logging.error, "Failed to upate analysis with inputs: %s" % str(rte), "ERROR-CELERY-SNAKEMAKE", )
def lock_trial(trial_id: str, admin: str) -> bool: """ Locks a trial and associated assets so they cannot be used. Arguments: trial_id {str} -- ID of the trial to be locked. admin {str} -- Email of the admin requesting the lock. Returns: bool -- [description] """ token = lock_trial.token["access_token"] # Patch the trial to be locked. try: etag = EVE_FETCHER.get(endpoint="trials", item_id=trial_id, token=token) EVE_FETCHER.patch( endpoint="trials", item_id=trial_id, _etag=etag, token=token, json={"locked": True}, ) log_formatted( logging.info, "Administrator %s locked trial %s" % (admin, trial_id), "INFO-CELERY-TRIALLOCK", ) except RuntimeError as rte: log_formatted( logging.error, "Failed to lock trial: %s , Error: %s" % (trial_id, str(rte)), "ERROR-CELERY-API", )
def find_valid_runs(record_response: dict, assay_dict: dict) -> List[Tuple[dict, dict]]: """ Checks to see if any runs are possible. Arguments: record_response {dict} -- Return from the aggregation query of the /data endpoint. assay_dict {dict} -- Dictionary keyed on assay_id containing assay info. Returns: List[dict] -- Tuple of a file group, and the assay which it should run. """ valid_runs = [] for grouping in record_response: assay_id = grouping["_id"]["assay"] try: mappings_present = [ record["mapping"] for record in grouping["records"] ] if set(mappings_present) == set( assay_dict[assay_id]["non_static_inputs"]): # If mappings are 1:1, a run may start. valid_runs.append((grouping, assay_dict[assay_id])) except KeyError: log_formatted( logging.error, "A record with an invalid assay id was found", "ERROR-CELERY-SNAKEMAKE", ) return None return valid_runs
def update_child_list(record_response: dict, endpoint: str, parent_id: str) -> dict: """ Adds item to a record's child list. Arguments: record_response {dict} -- Response to POST of new child. endpoint {str} -- Resource endpoint of record. parent_id {str} -- id of parent record. Raises: RuntimeError -- [description] Returns: dict -- HTTP response to patch. """ records = None if "_items" in record_response: records = record_response["_items"] else: records = [record_response] new_children = [] for record in records: new_children.append({"_id": record["_id"], "resource": endpoint}) try: # Get etag of parent. parent = EVE_FETCHER.get( endpoint="data?where=%s" % json.dumps({"_id": parent_id}), token=process_file.token["access_token"], ).json() # Update parent record. return EVE_FETCHER.patch( endpoint="data_edit", item_id=str(parent["_items"][0]["_id"]), json={"children": parent["_items"][0]["children"] + new_children}, _etag=parent["_items"][0]["_etag"], token=process_file.token["access_token"], ) except RuntimeError as code_error: if not parent: log_formatted( logging.error, "Error fetching parent %s" % str(code_error), "ERROR-CELERY-GET", ) else: log_formatted( logging.error, "Error updating child list of parent: %s" % str(code_error), "ERROR-CELERY-PATCH-FAIR", ) return None
def check_for_runs(token: str) -> Tuple[List[dict], List[dict]]: """ Checks to see if any runs can start Arguments: token {str} -- JWT for API. Returns: Tuple[List[dict], List[dict]] -- Tuple, where each item is a data/assay pairing representing a run that can be started. """ try: # Only return assays that have a workflow associated with them assay_query = {"workflow_location": {"$ne": "null"}} assay_query_string = "assays?where=%s" % json.dumps(assay_query) # Contains a list of all the running assays and their inputs assay_response = EVE.get(token=token, endpoint=assay_query_string, code=200).json()["_items"] sought_mappings = [ item for sublist in [x["non_static_inputs"] for x in assay_response] for item in sublist ] query_string = "data/query?aggregate=%s" % (json.dumps( {"$inputs": sought_mappings})) record_response = EVE.get(token=token, endpoint=query_string, code=200) # Create an assay id keyed dictionary to simplify searching. assay_dict = { assay["_id"]: { "non_static_inputs": assay["non_static_inputs"], "assay_name": assay["assay_name"], "workflow_location": assay["workflow_location"], } for assay in assay_response } return record_response, assay_dict except RuntimeError as rte: try: str(query_string) error_msg = "Failed to fetch record from data collection: %s" % str( rte) except NameError: error_msg = "Failed to fetch record from assays collection: %s" % str( rte) log_formatted(logging.error, error_msg, "ERROR-CELERY-QUERY") return None
def delete_from_bucket(gs_uri: str) -> None: """ Deletes a google clouds torage object with the given path. Arguments: gs_uri {str} -- Path of file to be deleted. Returns: None -- [description] """ gsutil_args = ["gsutil", "rm", gs_uri] run_subprocess_with_logs(gsutil_args, message="Deleting Object: %s") log_formatted( logging.info, "Finished deleting object: %s" % gs_uri, "FAIR-CELERY-DELETE" )
def serialize_objectids(items: List[dict]) -> None: """ Transforms the ID strings into ObjectID objects for propper mapping. Arguments: items {List[dict]} -- List of data records. """ for record in items: record["assay"] = ObjectId(record["assay"]) record["trial"] = ObjectId(record["trial"]) record["processed"] = False record["visibility"] = True record["children"] = [] log = "Record: %s, for Trial: %s, in Assay: %s uploaded" % ( record["file_name"], str(record["trial"]), str(record["assay"]), ) log_formatted(logging.info, log, "INFO-EVE-DATA")
def postprocessing(records: List[dict]) -> None: """ Scans incoming records and sees if they need post-processing. If so, they are processed and then uploaded to mongo. Arguments: records {List[dict]} -- List of records slated for processing. """ tasks = [] for rec in records: message = "Processing: " + rec["file_name"] logging.info({"message": message, "category": "FAIR-CELERY-PROCESSING"}) for pro in PROC: if re.search(PROC[pro]["re"], rec["file_name"], re.IGNORECASE): log_formatted( logging.info, "Match found for " + rec["file_name"], "INFO-CELERY-PROCESSING", ) # If a match is found, add to job queue tasks.append(process_file.s(rec, pro)) execute_in_parallel(tasks)
def tail_logs(logs: List[str]) -> List[str]: """ Takes a list of log uris and returns their tails concat into one. Arguments: logs {List[str]} -- [description] """ log_list: List[str] = [] for i, uri in enumerate(logs): try: filename: str = "tmp%s" % str(i) gsutil_args = ["gsutil", "cp", "gs://" + uri, filename] run_subprocess_with_logs(gsutil_args, message="Copying logs for tailing") last_lines = str.join("", tail(filename, lines=50)) os.remove(filename) log_list.append(last_lines) except FileNotFoundError: log_formatted( logging.warning, "No log found for gs_uri: %s. This may be the result of a failed pipeline.", "WARNING-CELERY-SNAKEMAKE", ) return log_list
def process_file(rec: dict, pro: str) -> bool: """ Worker process that handles processing an individual file. Arguments: rec {dict} -- A record item to be processed. pro {str} -- Key in the processing dictionary that the filetype corresponds to. Returns: boolean -- True if completed without error, else false. """ temp_file_name = str(uuid4()) gs_args = ["gsutil", "cp", rec["gs_uri"], temp_file_name] subprocess.run(gs_args) records = None response = None try: # Use a random filename as tasks are executing in parallel. if PROC[pro]["mongo"]: records = PROC[pro]["func"]( temp_file_name, RecordContext( rec["trial"]["$oid"], rec["assay"]["$oid"], rec["_id"]["$oid"] ), ) else: PROC[pro]["func"]( temp_file_name, RecordContext( rec["trial"]["$oid"], rec["assay"]["$oid"], rec["_id"]["$oid"], rec ), ) remove(temp_file_name) except OSError: pass if not PROC[pro]["mongo"]: return True if not records: return False try: # First try a normal upload response = EVE_FETCHER.post( endpoint=pro, token=process_file.token["access_token"], code=201, json=records, ) update_child_list(response.json(), pro, rec["_id"]["$oid"]) # Simplify handling later if not isinstance(records, (list,)): records = [records] # Record uploads. log_record_upload(records, pro) return True except RuntimeError as rte: # Catch unexpected error codes. if "422" not in str(rte): log_formatted( logging.error, "Upload failed for reasons other than validation: %s" % str(rte), "ERROR-CELERY-UPLOAD", ) return False # Extract Cerberus errors from response. new_upload = report_validation_issues(response, records) if not new_upload: # Handle data types that haven't used the new schema yet. logging.warning( { "message": ( "Validation error detected in upload, but schema not adapted to" "handle error reports" ), "category": "WARNING-CELERY-UPLOAD", } ) return False try: # Try to upload just the errors. errors = EVE_FETCHER.post( endpoint=pro, token=process_file.token["access_token"], code=201, json=new_upload, ) # Still update parent to tie to error documents. update_child_list(errors.json(), pro, rec["_id"]["$oid"]) log_record_upload(new_upload, pro) except RuntimeError as rte: # If that fails, something on my end is wrong. log = "Upload of validation errors failed. %s" % str(rte) logging.error({"message": log, "category": "ERROR-CELERY-UPLOAD"}) return False
def process_maf(path: str, context: RecordContext) -> bool: """ Processes a new maf file to update the combined maf. Arguments: path {str} -- On disk path of the file. context {RecordContext} -- Class holding trial, assay, and a copy of the record. Returns: bool -- True if finished without error, else false. """ if context.full_record["file_name"] == "combined.maf": return True token = process_file.token["access_token"] # Check if a combined maf already exists. query_string = "data?where=%s" % json.dumps( {"trial": context.trial, "assay": context.assay, "file_name": "combined.maf"} ) extant_combined_maf = EVE_FETCHER.get(endpoint=query_string, token=token).json()[ "_items" ] if not extant_combined_maf: create_new_maf(path, context, token) return True combined_maffile = extant_combined_maf[0] maf_gs_uri = combined_maffile["gs_uri"] combined_file_name = str(uuid4()) run_subprocess_with_logs( ["gsutil", "mv", maf_gs_uri, combined_file_name], message="Copying combined maf file %s" % maf_gs_uri, ) # Combine mafs. combine_mafs(path, combined_file_name) # Overwrite old file. run_subprocess_with_logs( ["gsutil", "mv", combined_file_name, maf_gs_uri], message="Overwriting old combined.maf: %s" % maf_gs_uri, ) log_formatted( logging.info, "Overwriting old combined.maf: %s" % maf_gs_uri, "FAIR-CELERY-COMBINEDMAF", ) new_sample_ids = combined_maffile["sample_ids"] + context.full_record["sample_ids"] # Patch data to include new samples. try: EVE_FETCHER.patch( endpoint="data_edit", item_id=combined_maffile["_id"], _etag=combined_maffile["_etag"], json={ "sample_ids": new_sample_ids, "number_of_samples": len(new_sample_ids), }, token=token, ) return True except RuntimeError as rte: if "412" in str(rte): time.sleep(5) process_maf(path, context) return True log_formatted( logging.error, "Failed to edit combined.maf: %s" % str(rte), "ERROR-CELERY-PATCH", ) return False
def create_new_maf(path: str, context: RecordContext, token: str) -> bool: """ Create a new combined MAF. Arguments: path {str} -- path to file context {RecordContext} -- Object holding the record. Returns: bool -- True if succeeds, else false. """ new_record = context.full_record # Check for other analyses that finished recently. an_query = "analysis?where=%s" % json.dumps( {"trial": context.trial, "assay": context.assay, "status": "Completed"} ) analysis_runs = EVE_FETCHER.get(endpoint=an_query, token=token).json()["_items"] isotime = datetime.datetime.now(datetime.timezone.utc) my_end = 0 my_run_id = None for run in analysis_runs: for output in run["files_generated"]: if output["file_name"] == new_record["file_name"]: my_end = parser.parse(run["end_date"]) my_run_id = run["_id"] # If the time is close enough that the trial might not have cleaned up yet, # wait, then retry. for run in analysis_runs: end = parser.parse(run["end_date"]) delta = (isotime - end).total_seconds() if run["_id"] != my_run_id and delta < 30: log_formatted( logging.info, "Another run ended recently, running checks to avoid race condition.", "INFO-CELERY-MAF", ) if ( my_end > end or my_end == end and min(my_run_id, run["_id"]) == my_run_id ): log_formatted( logging.info, "Detecting an older finished run. Sleeping", "INFO-CELERY-SLEEP", ) time.sleep(5) process_maf(path, context) return True # Rename and create a new data entry. reformat_maf(new_record, context) # Generate new alias for combined maf. new_record["gs_uri"] = ( new_record["gs_uri"].replace(new_record["file_name"], "") + "/combined.maf" ) run_subprocess_with_logs( ["gsutil", "mv", path, new_record["gs_uri"]], message="Copying new combined.maf to google bucket: %s" % new_record["gs_uri"], ) try: remove(path) except FileNotFoundError: pass log_formatted( logging.info, "Copying for new combined.maf: %s complete." % new_record["gs_uri"], "FAIR-CELERY-NEWCOMBINEDMAF", ) try: EVE_FETCHER.post(endpoint="data_edit", token=token, json=new_record, code=201) return True except RuntimeError as rte: log_formatted( logging.error, "Failed to create new combined.maf: %s" % str(rte), "ERROR-CELERY-POST", ) return False
def execute_workflow(valid_run: Tuple[dict, dict]): """ Create inputs, run snakemake, report results. Arguments: valid_run {Tuple[dict, dict]} -- Tuple of (aggregation result, assay info) """ # Check that all records are unprocessed. token = execute_workflow.token["access_token"] records, all_free = check_processed(valid_run[0]["records"], token) analysis_response = register_analysis(valid_run, token) if not all_free: logging.error({ "message": "Some input files are processed, check celery logic!", "category": "ERROR-CELERY-SNAKEMAKE", }) return False logging.info({ "message": "Setting files to processed", "category": "INFO-CELERY-SNAKEMAKE" }) set_record_processed(records, True, token) # reference shortcuts aggregation_res = valid_run[0]["_id"] cimac_sample_id: str = aggregation_res["sample_ids"][0] run_id: str = str(analysis_response["_id"]) snakemake_file = clone_snakemake(valid_run[1]["workflow_location"], run_id) analysis_response["files_used"] = create_input_json( valid_run[0]["records"], run_id, cimac_sample_id) analysis_response["_etag"] = add_inputs(run_id, analysis_response["files_used"], token) # Run Snakefile. try: workflow_dag, problem = run_snakefile(snakemake_file, workdir=run_id) if problem: logging.error({ "message": "Snakemake run failed!", "category": "ERROR-CELERY-SNAKEMAKE", }) records, all_free = check_processed(valid_run[0]["records"], token) set_record_processed(records, False, token) error_str = str(problem) logging.error({ "message": error_str, "category": "ERROR-CELERY-SNAKEMAKE" }) send_mail( "Snakemake Pipeline Failed", "The snakemake pipeline failed with message: %s" % error_str, ["*****@*****.**"], "*****@*****.**", SENDGRID_API_KEY, ) return False return True except Exception as excp: log_formatted( logging.error, "Uncaught Snakemake Exception: %s. Run has failed." % str(excp), "ERROR-CELERY-SNAKEMAKE", ) problem = str(excp) finally: update_analysis(valid_run, token, analysis_response, workflow_dag, problem=problem) return False
def update_analysis(valid_run: Tuple[dict, dict], token: str, analysis: dict, dag, problem=None) -> bool: """ Update analysis record with results when the analysis has completed. Arguments: valid_run {Tuple[dict, dict]} -- Information about the run. token {str} -- JWT analysis {dict} -- Analysis id, _etag, input files. dag {snakemake.dag.DAG} -- DAG objet from the finished run. problem {str} -- Error message, string "Error" or None. Returns: bool -- True if update completes without error, else false. """ payload = valid_run[0]["_id"] payload["workflow_location"] = valid_run[1]["workflow_location"] # Handle the case of failure to generate a DAG if dag: jobs, outputs = analyze_jobs(dag) payload["jobs"] = jobs else: payload["jobs"] = [] snake_log_info = upload_snakelogs(analysis["_id"]) payload["snakemake_logs"] = snake_log_info["log_locations"] payload["snakemake_log_tails"] = snake_log_info["log_tails"] payload["files_used"] = analysis["files_used"] payload["end_date"] = datetime.datetime.now( datetime.timezone.utc).isoformat() # If job succeeds, upload files. if outputs and not problem: payload["files_generated"] = upload_results(valid_run, outputs, token) if not payload["files_generated"]: payload["status"] = "Failed" payload[ "error_message"] = "The pipeline failed to upload the results to the database." payload.__delitem__("files_generated") if problem: payload["status"] = "Failed" if problem == "Error": payload["error_message"] = ( "One of the jobs experienced an error," + "please consult the logs for more information") else: payload["error_message"] = "Snakefile Error: %s" % str(problem) else: payload["status"] = "Completed" # some upload function. try: EVE.patch( endpoint="analysis", token=token, _etag=analysis["_etag"], item_id=analysis["_id"], json=payload, code=200, ) except RuntimeError as rte: log_formatted( logging.error, "Analysis update failed: %s.\nPayload:%s" % (str(rte), payload), "ERROR-CELERY-PATCH", ) return False return True
def upload_results(valid_run: dict, outputs: List[str], token: str) -> List[dict]: """ Upload the output files from a run. Arguments: valid_run {dict} -- Run information. outputs {List[str]} -- List of GS-uris of output files. token {str} -- JWT Returns: List[dict] -- List of inserted records. """ payload = [] bucket = storage.Client().get_bucket(GOOGLE_BUCKET_NAME) aggregation_res = valid_run[0]["_id"] for output in outputs: prefix = output.replace(GOOGLE_BUCKET_NAME + "/", "") logging.info({"message": prefix, "category": "PREFIX-EBUG"}) output_file = [item for item in bucket.list_blobs(prefix=prefix)][0] payload.append({ "data_format": get_data_format(output), "date_created": datetime.datetime.now(datetime.timezone.utc).isoformat(), "file_name": output_file.name.split("/")[-1], "uuid_alias": "unaliased", "file_size": output_file.size, "sample_ids": aggregation_res["sample_ids"], "number_of_samples": len(aggregation_res["sample_ids"]), "trial": aggregation_res["trial"], "trial_name": aggregation_res["trial_name"], "assay": aggregation_res["assay"], "gs_uri": "gs://%s/%s" % (GOOGLE_BUCKET_NAME, output_file.name), "mapping": "", "experimental_strategy": aggregation_res["experimental_strategy"], "processed": False, "visibility": True, "children": [], }) authorized_users = get_authorized_users( { "assay": aggregation_res["assay"], "trial": aggregation_res["trial"] }, token, ) manage_bucket_acl(GOOGLE_BUCKET_NAME, payload[-1]["gs_uri"], authorized_users) try: inserts = EVE.post(endpoint="data_edit", code=201, token=token, json=payload).json()["_items"] logging.info({ "message": "Records created for run outputs", "category": "FAIR-CELERY-SNAKEMAKE", }) return [{ "data_id": x["_id"], "gs_uri": y["gs_uri"], "file_name": y["file_name"], "data_format": y["data_format"], } for x, y in zip(inserts, payload)] except RuntimeError as rte: log_formatted( logging.error, "Upload of run outputs failed: %s" % str(rte), "ERROR-CELERY-UPLOAD", ) return None