def worker(result_file_name, queue, mode, s3role):
    # Interpret mode; it can either be a single op or something like
    # info_download or info_upload which implies:
    #  - for download: we need to return the information as well
    #  - for upload: we need to not overwrite the file if it exists
    modes = mode.split("_")
    pre_op_info = False
    if len(modes) > 1:
        pre_op_info = True
        mode = modes[1]
    else:
        mode = modes[0]

    def op_info(url):
        try:
            head = s3.head_object(Bucket=url.bucket, Key=url.path)
            to_return = {
                "error": None,
                "size": head["ContentLength"],
                "content_type": head["ContentType"],
                "metadata": head["Metadata"],
                "last_modified": get_timestamp(head["LastModified"]),
            }
        except client_error as err:
            error_code = normalize_client_error(err)
            if error_code == 404:
                to_return = {"error": ERROR_URL_NOT_FOUND, "raise_error": err}
            elif error_code == 403:
                to_return = {"error": ERROR_URL_ACCESS_DENIED, "raise_error": err}
            else:
                to_return = {"error": error_code, "raise_error": err}
        return to_return

    with open(result_file_name, "w") as result_file:
        try:
            from metaflow.datatools.s3util import get_s3_client

            s3, client_error = get_s3_client(s3_role_arn=s3role)
            while True:
                url, idx = queue.get()
                if url is None:
                    break
                if mode == "info":
                    result = op_info(url)
                    orig_error = result.get("raise_error", None)
                    if orig_error:
                        del result["raise_error"]
                    with open(url.local, "w") as f:
                        json.dump(result, f)
                elif mode == "download":
                    tmp = NamedTemporaryFile(dir=".", mode="wb", delete=False)
                    try:
                        if url.range:
                            resp = s3.get_object(
                                Bucket=url.bucket, Key=url.path, Range=url.range
                            )
                        else:
                            resp = s3.get_object(Bucket=url.bucket, Key=url.path)
                        sz = resp["ContentLength"]
                        if not url.range and sz > DOWNLOAD_FILE_THRESHOLD:
                            # In this case, it is more efficient to use
                            # download_file as it will download multiple parts
                            # in parallel (it does it after multipart_threshold)
                            s3.download_file(url.bucket, url.path, tmp.name)
                        else:
                            read_in_chunks(tmp, resp["Body"], sz, DOWNLOAD_MAX_CHUNK)
                        tmp.close()
                        os.rename(tmp.name, url.local)
                    except client_error as err:
                        tmp.close()
                        os.unlink(tmp.name)
                        error_code = normalize_client_error(err)
                        if error_code == 404:
                            result_file.write("%d %d\n" % (idx, -ERROR_URL_NOT_FOUND))
                            continue
                        elif error_code == 403:
                            result_file.write(
                                "%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED)
                            )
                            continue
                        else:
                            raise
                        # TODO specific error message for out of disk space
                    # If we need the metadata, get it and write it out
                    if pre_op_info:
                        with open("%s_meta" % url.local, mode="w") as f:
                            args = {"size": resp["ContentLength"]}
                            if resp["ContentType"]:
                                args["content_type"] = resp["ContentType"]
                            if resp["Metadata"] is not None:
                                args["metadata"] = resp["Metadata"]
                            if resp["LastModified"]:
                                args["last_modified"] = get_timestamp(
                                    resp["LastModified"]
                                )
                            json.dump(args, f)
                    # Finally, we push out the size to the result_pipe since
                    # the size is used for verification and other purposes and
                    # we want to avoid file operations for this simple process
                    result_file.write("%d %d\n" % (idx, resp["ContentLength"]))
                else:
                    # This is upload; if we have a pre_op, it means we do not
                    # want to overwrite
                    do_upload = False
                    if pre_op_info:
                        result_info = op_info(url)
                        if result_info["error"] == ERROR_URL_NOT_FOUND:
                            # We only upload if the file is not found
                            do_upload = True
                    else:
                        # No pre-op so we upload
                        do_upload = True
                    if do_upload:
                        extra = None
                        if url.content_type or url.metadata:
                            extra = {}
                            if url.content_type:
                                extra["ContentType"] = url.content_type
                            if url.metadata is not None:
                                extra["Metadata"] = url.metadata
                        s3.upload_file(
                            url.local, url.bucket, url.path, ExtraArgs=extra
                        )
                        # We indicate that the file was uploaded
                        result_file.write("%d %d\n" % (idx, 0))
        except:
            traceback.print_exc()
            sys.exit(ERROR_WORKER_EXCEPTION)
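# The sketch below is a hypothetical driver for worker(), not part of the
# module: it assumes a minimal S3Url-like namedtuple carrying the attributes
# the worker reads (bucket, path, local, range, content_type, metadata), and
# it follows the queue protocol visible above: (url, idx) tuples, terminated
# by a (None, None) sentinel. Bucket and key names are placeholders, and a
# working AWS setup is assumed; the real tool builds these inputs from its
# command-line arguments.
if __name__ == "__main__":
    from collections import namedtuple
    from multiprocessing import Process, Queue

    S3Url = namedtuple(
        "S3Url", ["bucket", "path", "local", "range", "content_type", "metadata"]
    )

    queue = Queue()
    queue.put(
        (
            S3Url(
                bucket="my-bucket",        # hypothetical bucket
                path="data/example.bin",   # hypothetical key
                local="example.bin",       # local destination path
                range=None,                # None means full-object download
                content_type=None,
                metadata=None,
            ),
            0,  # idx lines up this input with its line in the result file
        )
    )
    queue.put((None, None))  # sentinel: tells the worker to exit its loop

    p = Process(target=worker, args=("results.txt", queue, "download", None))
    p.start()
    p.join()

    # Each result line is "<idx> <size>"; negative sizes encode the ERROR_*
    # codes written on 404/403.
    with open("results.txt") as f:
        print(f.read())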