def download_file(
    client: boto3.client,
    bucket_name: str,
    kname: str,
    dest_file: str,
    do_md5_check: bool = False,
    md5sum: str = "",
    attempts: int = DOWNLOAD_ATTEMPTS,
) -> bool:
    """Download an S3 object to dest_file, retrying up to attempts times.

    If do_md5_check is set, retry until the downloaded file's checksum
    matches md5sum. Returns True on success, False if all attempts fail.
    """
    for _ in range(attempts):
        try:
            print(f"Downloading {dest_file} | {md5sum}")
            client.download_file(bucket_name, kname, dest_file)
        except ClientError:
            continue
        if not do_md5_check:
            return True
        if check_md5(dest_file, md5sum):
            print(f"{dest_file} passed md5sum -> DONE!")
            return True
        print(f"{dest_file} failed md5sum -> Trying again...")
    print(f"Download attempts exceeded for s3 key: {kname}")
    return False
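
# check_md5 is called above but not defined in this excerpt. A minimal sketch
# of what it might look like, assuming it compares the file's MD5 hex digest
# against the expected value; the project's real implementation may differ.
import hashlib


def check_md5(dest_file: str, md5sum: str) -> bool:
    """Return True if dest_file's MD5 hex digest equals md5sum."""
    digest = hashlib.md5()
    with open(dest_file, "rb") as handle:
        # Hash in chunks so large downloads need not fit in memory.
        for chunk in iter(lambda: handle.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == md5sum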
def run_job(
    job: str,
    s3client: client,
    metrics: JobMetrics,
    queue_url: str,
    receipt_handle: str,
) -> int:
    """Run the job described by an SQS message and upload its results.

    :param job: The job file describing what needs to be run.
    :param s3client: S3 client used to download input files and upload results.
    :param metrics: Collector for the job's runtime metrics.
    :param queue_url: URL of the SQS queue the job message came from.
    :param receipt_handle: Receipt handle used to extend message visibility.
    :return: int
    """
    ret_val = 1
    try:
        job_info: dict = loads(job)
        if "job_date" not in job_info:
            _LOGGER.error("ERROR: Missing job date for job, %s", job)
            return ret_val
        if "job_id" not in job_info:
            _LOGGER.error("ERROR: Missing job id for job, %s", job)
            return ret_val
    except JSONDecodeError as error:
        _LOGGER.error(
            "ERROR: Unable to load json information for job, %s \n\t%s",
            job,
            error,
        )
        return ret_val

    job_type = job_info["job_type"]
    job_tag = f"{job_info['job_date']}/{job_info['job_id']}"
    rundir = f"{GLOBAL_VARS['JOB_PATH']}{job_tag}"
    inbucket = job_info["bucket_name"]

    # Prepare job directory and download input files
    makedirs(rundir, exist_ok=True)
    chdir(rundir)
    for file in job_info["input_files"]:
        if "https" in file:
            name = f"{job_tag}/{file.split('/')[-1]}"
            try:
                request.urlretrieve(file, f"{GLOBAL_VARS['JOB_PATH']}{name}")
            except Exception as error:
                # TODO: intendo 2021/05/05 - Find more specific exception
                _LOGGER.exception(
                    "%s ERROR: Download failed for file, %s \n\t%s",
                    job_tag,
                    name,
                    error,
                )
                return cleanup_job(job_tag, rundir)
        else:
            try:
                s3client.download_file(
                    inbucket, file, f"{GLOBAL_VARS['JOB_PATH']}{file}"
                )
            except Exception as error:
                # TODO: intendo 2021/05/05 - Find more specific exception
                _LOGGER.exception(
                    "%s ERROR: Download failed for file, %s \n\t%s",
                    job_tag,
                    file,
                    error,
                )
                return cleanup_job(job_tag, rundir)

    # Run job and record associated metrics
    update_status(
        s3client,
        job_tag,
        job_type,
        JOBSTATUS.RUNNING,
        [],
    )
    # TODO: (Eo300) consider moving binary command
    #       (e.g. 'apbs', 'pdb2pqr30') into SQS message
    if JOBTYPE.APBS.name.lower() in job_type:
        command = f"apbs {job_info['command_line_args']}"
    elif JOBTYPE.PDB2PQR.name.lower() in job_type:
        command = f"pdb2pqr30 {job_info['command_line_args']}"
    else:
        raise KeyError(f"Invalid job type, {job_type}")

    if "max_run_time" in job_info:
        sqs = client("sqs", region_name=GLOBAL_VARS["AWS_REGION"])
        sqs.change_message_visibility(
            QueueUrl=queue_url,
            ReceiptHandle=receipt_handle,
            VisibilityTimeout=int(job_info["max_run_time"]),
        )

    # Execute job binary with appropriate arguments and record metrics
    try:
        metrics.start_time = time()
        metrics.exit_code = execute_command(
            job_tag,
            command,
            f"{job_type}.stdout.txt",
            f"{job_type}.stderr.txt",
        )
        metrics.end_time = time()
        # We need to create the {job_type}-metrics.json before we upload
        # the files to the S3_TOPLEVEL_BUCKET.
        metrics.write_metrics(job_tag, job_type, ".")
    except Exception as error:
        # TODO: intendo 2021/05/05 - Find more specific exception
        _LOGGER.exception(
            "%s ERROR: Failed to execute job: %s",
            job_tag,
            error,
        )
        # TODO: Should this return 1 because no one else will succeed?
        ret_val = 1

    # Upload directory contents to S3
    for file in listdir("."):
        try:
            file_path = f"{job_tag}/{file}"
            _LOGGER.info(
                "%s Uploading file to output bucket, %s", job_tag, file
            )
            s3client.upload_file(
                f"{GLOBAL_VARS['JOB_PATH']}{file_path}",
                GLOBAL_VARS["S3_TOPLEVEL_BUCKET"],
                f"{file_path}",
            )
        except ClientError as error:
            _LOGGER.exception(
                "%s ERROR: Failed to upload file, %s \n\t%s",
                job_tag,
                f"{job_tag}/{file}",
                error,
            )
            # ret_val = 1

    # TODO: 2021/03/30, Elvis - Will need to address how we bundle output
    #       subdirectory for PDB2PKA when used; I previously bundled it as
    #       a compressed tarball (i.e. "{job_id}-pdb2pka_output.tar.gz")
"{job_id}-pdb2pka_output.tar.gz") # Create list of output files input_files_no_id = [ # Remove job_id prefix from input file list "".join(name.split("/")[-1]) for name in job_info["input_files"] ] output_files = [ f"{job_tag}/{filename}" for filename in listdir(".") if filename not in input_files_no_id ] # Cleanup job directory and update status cleanup_job(job_tag, rundir) update_status( s3client, job_tag, job_type, JOBSTATUS.COMPLETE, output_files, ) return ret_val