Code example #1
import boto3
from botocore.exceptions import ClientError


def download_file(client: boto3.client,
                  bucket_name: str,
                  kname: str,
                  dest_file: str,
                  do_md5_check: bool = False,
                  md5sum: str = '',
                  attempts: int = DOWNLOAD_ATTEMPTS) -> bool:
    """Download an S3 object to dest_file, retrying up to `attempts` times.

    Returns True on success and False if every attempt fails.
    """
    ntry = 0
    while ntry < attempts:
        ntry += 1
        try:
            print(f'Downloading {dest_file} | {md5sum}')
            client.download_file(bucket_name, kname, dest_file)
        except ClientError:
            # Network or S3 error: try again until the attempts run out.
            continue

        if not do_md5_check:
            return True
        if check_md5(dest_file, md5sum):
            print(f'{dest_file} passed md5sum -> DONE!')
            return True
        print(f'{dest_file} failed md5sum -> Trying again...')

    print(f"Retry attempts exceeded for s3 key: {kname}")
    return False
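
The helper `check_md5` and the constant `DOWNLOAD_ATTEMPTS` are defined elsewhere in the module and are not shown above. A minimal sketch of what they might look like, assuming the expected checksum is the hex MD5 digest of the whole file and three attempts by default:

import hashlib

DOWNLOAD_ATTEMPTS = 3  # assumed default; the real constant lives elsewhere


def check_md5(path: str, expected_md5: str) -> bool:
    """Return True if the file's MD5 hex digest matches the expected value."""
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        # Hash the file in chunks so large downloads do not load into memory.
        for chunk in iter(lambda: handle.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected_md5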
Code example #2
def run_job(
    job: str,
    s3client: client,
    metrics: JobMetrics,
    queue_url: str,
    receipt_handle: str,
) -> int:
    """Remove the directory for the job.

    :param job:  The job file describing what needs to be run.
    :param s3client:  S3 input bucket with input files.
    :return:  int
    """
    ret_val = 1
    try:
        job_info: dict = loads(job)
        if "job_date" not in job_info:
            _LOGGER.error("ERROR: Missing job date for job, %s", job)
            return ret_val
        if "job_id" not in job_info:
            _LOGGER.error("ERROR: Missing job id for job, %s", job)
            return ret_val
    except JSONDecodeError as error:
        _LOGGER.error(
            "ERROR: Unable to load json information for job, %s \n\t%s",
            job,
            error,
        )
        return ret_val
    job_type = job_info["job_type"]
    job_tag = f"{job_info['job_date']}/{job_info['job_id']}"
    rundir = f"{GLOBAL_VARS['JOB_PATH']}{job_tag}"
    inbucket = job_info["bucket_name"]

    # Prepare job directory and download input files
    makedirs(rundir, exist_ok=True)
    chdir(rundir)

    for file in job_info["input_files"]:
        if "https" in file:
            name = f"{job_tag}/{file.split('/')[-1]}"
            try:
                request.urlretrieve(file, f"{GLOBAL_VARS['JOB_PATH']}{name}")
            except Exception as error:
                # TODO: intendo 2021/05/05 - Find more specific exception
                _LOGGER.exception(
                    "%s ERROR: Download failed for file, %s \n\t%s",
                    job_tag,
                    name,
                    error,
                )
                return cleanup_job(job_tag, rundir)
        else:
            try:
                s3client.download_file(
                    inbucket, file, f"{GLOBAL_VARS['JOB_PATH']}{file}"
                )
            except Exception as error:
                # TODO: intendo 2021/05/05 - Find more specific exception
                _LOGGER.exception(
                    "%s ERROR: Download failed for file, %s \n\t%s",
                    job_tag,
                    file,
                    error,
                )
                return cleanup_job(job_tag, rundir)

    # Run job and record associated metrics
    update_status(
        s3client,
        job_tag,
        job_type,
        JOBSTATUS.RUNNING,
        [],
    )

    # TODO: (Eo300) consider moving binary
    #       command (e.g. 'apbs', 'pdb2pqr30') into SQS message
    if JOBTYPE.APBS.name.lower() in job_type:
        command = f"apbs {job_info['command_line_args']}"
    elif JOBTYPE.PDB2PQR.name.lower() in job_type:
        command = f"pdb2pqr30 {job_info['command_line_args']}"
    else:
        raise KeyError(f"Invalid job type, {job_type}")

    if "max_run_time" in job_info:
        sqs = client("sqs", region_name=GLOBAL_VARS["AWS_REGION"])
        sqs.change_message_visibility(
            QueueUrl=queue_url,
            ReceiptHandle=receipt_handle,
            VisibilityTimeout=int(job_info["max_run_time"]),
        )

    # Execute job binary with appropriate arguments and record metrics
    try:
        metrics.start_time = time()
        metrics.exit_code = execute_command(
            job_tag,
            command,
            f"{job_type}.stdout.txt",
            f"{job_type}.stderr.txt",
        )
        metrics.end_time = time()

        # We need to create the {job_type}-metrics.json before we upload
        # the files to the S3_TOPLEVEL_BUCKET.
        metrics.write_metrics(job_tag, job_type, ".")
        ret_val = 0  # The job executed and its metrics were recorded.
    except Exception as error:
        # TODO: intendo 2021/05/05 - Find more specific exception
        _LOGGER.exception(
            "%s ERROR: Failed to execute job: %s",
            job_tag,
            error,
        )
        # TODO: Should this return 1 because no one else will succeed?
        ret_val = 1

    # Upload directory contents to S3
    for file in listdir("."):
        try:
            file_path = f"{job_tag}/{file}"
            _LOGGER.info(
                "%s Uploading file to output bucket, %s", job_tag, file
            )
            s3client.upload_file(
                f"{GLOBAL_VARS['JOB_PATH']}{file_path}",
                GLOBAL_VARS["S3_TOPLEVEL_BUCKET"],
                f"{file_path}",
            )
        except ClientError as error:
            _LOGGER.exception(
                "%s ERROR: Failed to upload file, %s \n\t%s",
                job_tag,
                f"{job_tag}/{file}",
                error,
            )
        # ret_val = 1

    # TODO: 2021/03/30, Elvis - Will need to address how we bundle the output
    #       subdirectory for PDB2PKA when used; I previously bundled it as
    #       a compressed tarball (i.e. "{job_id}-pdb2pka_output.tar.gz")

    # Create list of output files
    input_files_no_id = [  # Remove the job_id prefix from the input file list
        name.split("/")[-1] for name in job_info["input_files"]
    ]
    output_files = [
        f"{job_tag}/{filename}"
        for filename in listdir(".")
        if filename not in input_files_no_id
    ]

    # Cleanup job directory and update status
    cleanup_job(job_tag, rundir)
    update_status(
        s3client,
        job_tag,
        job_type,
        JOBSTATUS.COMPLETE,
        output_files,
    )

    return ret_val
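
For context, `run_job` is shaped to be driven by an SQS consumer: the message body carries the job JSON, and the receipt handle lets the worker extend the message's visibility timeout for long-running jobs. A minimal polling loop that could call it might look like the sketch below; the `poll_queue` helper and the bare `JobMetrics()` constructor call are assumptions for illustration, not part of the original code.

from boto3 import client


def poll_queue(queue_url: str) -> None:
    """Receive job messages one at a time and hand each to run_job."""
    sqs = client("sqs", region_name=GLOBAL_VARS["AWS_REGION"])
    s3 = client("s3", region_name=GLOBAL_VARS["AWS_REGION"])
    while True:
        response = sqs.receive_message(
            QueueUrl=queue_url,
            MaxNumberOfMessages=1,
            WaitTimeSeconds=20,
        )
        for message in response.get("Messages", []):
            metrics = JobMetrics()  # assumed constructor for illustration
            run_job(
                message["Body"],
                s3,
                metrics,
                queue_url,
                message["ReceiptHandle"],
            )
            # Remove the message once the job has been handled.
            sqs.delete_message(
                QueueUrl=queue_url,
                ReceiptHandle=message["ReceiptHandle"],
            )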