Example #1
def download_artifact(client,
                      artifacts_s3_bucket,
                      artifact_path,
                      output_dir=None):
    output_path = Path(output_dir if output_dir is not None else os.getcwd())

    response = client.list_objects(Bucket=artifacts_s3_bucket,
                                   Prefix=artifact_path)

    total_size = 0
    keys = []
    etags = []
    for obj in response.get("Contents") or []:
        key = obj["Key"]
        etag = obj["ETag"]
        dest_path = dest_file_path(key, output_path)
        if dest_path.exists():
            etag_path = etag_file_path(key, output_path)
            if etag_path.exists():
                if etag_path.read_text() != etag:
                    os.remove(etag_path)
                    os.remove(dest_path)
                else:
                    continue

        total_size += obj["Size"]
        # Skip zero-byte objects and keys ending in "/" that only mark
        # folders (required by FUSE); everything else is queued for download.
        if obj["Size"] > 0 and not key.endswith("/"):
            keys.append(key)
            etags.append(etag)

    downloader = transfer.S3Transfer(client, transfer.TransferConfig(),
                                     transfer.OSUtils())

    # TODO: Download files in parallel
    with tqdm(
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
            desc=f"Downloading artifact '{short_artifact_path(artifact_path)}'"
    ) as pbar:
        for key, etag in zip(keys, etags):

            def callback(size):
                pbar.update(size)

            file_path = dest_file_path(key, output_path)
            file_path.parent.mkdir(parents=True, exist_ok=True)

            downloader.download_file(artifacts_s3_bucket,
                                     key,
                                     str(file_path),
                                     callback=callback)

            etag_path = Path(etag_file_path(key, output_path))
            etag_path.parent.mkdir(parents=True, exist_ok=True)
            etag_path.write_text(etag)
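A hedged usage sketch for download_artifact. The bucket, artifact path, and output directory below are illustrative, and the helpers the function relies on (dest_file_path, etag_file_path, short_artifact_path) are assumed to be defined elsewhere in the same module:

import boto3

s3_client = boto3.client("s3")
download_artifact(s3_client,
                  artifacts_s3_bucket="my-artifacts-bucket",   # hypothetical bucket
                  artifact_path="jobs/1234/artifacts/",        # hypothetical prefix
                  output_dir="/tmp/artifacts")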
Example #2
def upload_artifact(client, artifacts_s3_bucket, artifact_path, local_dir):
    total_size = 0
    for root, sub_dirs, files in os.walk(local_dir):
        for filename in files:
            file_path = os.path.join(root, filename)
            file_size = os.path.getsize(file_path)
            total_size += file_size

    uploader = transfer.S3Transfer(client, transfer.TransferConfig(),
                                   transfer.OSUtils())

    with tqdm(total=total_size,
              unit='B',
              unit_scale=True,
              unit_divisor=1024,
              desc=f"Uploading artifact '{short_artifact_path(artifact_path)}'"
              ) as pbar:

        def callback(size):
            pbar.update(size)

        for root, sub_dirs, files in os.walk(local_dir):
            for filename in files:
                file_path = os.path.join(root, filename)

                key = artifact_path + __remove_prefix(
                    str(file_path), str(Path(local_dir).absolute()))
                uploader.upload_file(
                    str(file_path),
                    artifacts_s3_bucket,
                    key,
                    callback=callback,
                )
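The __remove_prefix helper used to build the S3 key is not shown in the snippet; a minimal sketch of what it is assumed to do (strip the absolute local directory prefix so only the relative path is appended to artifact_path):

def __remove_prefix(text, prefix):
    # Hypothetical helper: drop the leading prefix if present, otherwise
    # return the text unchanged.
    if text.startswith(prefix):
        return text[len(prefix):]
    return text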
Example #3
def start_sync(s3_output_location, region, endpoint_url=None):
    """Starts intermediate folder sync which copies files from 'opt/ml/output/intermediate'
    directory to the provided s3 output location as files created or modified.
    If files are deleted it doesn't delete them from s3.

    It starts intermediate folder behavior as a daemonic process and
    only if the directory doesn't exists yet, if it does - it indicates
    that platform is taking care of syncing files to S3 and container should not interfere.

    Args:
        s3_output_location (str): name of the script or module.
        region (str): the location of the module.

    Returns:
        (multiprocessing.Process): the intermediate output sync daemonic process.
    """
    if not s3_output_location or os.path.exists(intermediate_path):
        logger.debug('Could not initialize intermediate folder sync to s3.')
        return

    # create intermediate and intermediate_tmp directories
    os.makedirs(intermediate_path)
    os.makedirs(tmp_dir_path)

    # configure unique s3 output location similar to how SageMaker platform does it
    # or link it to the local output directory
    url = urlparse(s3_output_location)
    if url.scheme == 'file':
        logger.debug('Local directory is used for output. No need to sync any intermediate output.')
        return
    elif url.scheme != 's3':
        raise ValueError("Expecting 's3' scheme, got: %s in %s" % (url.scheme, url))

    # create s3 transfer client
    client = boto3.client('s3', region, endpoint_url=endpoint_url)
    s3_transfer = s3transfer.S3Transfer(client)
    s3_uploader = {
        'transfer': s3_transfer,
        'bucket': url.netloc,
        'key_prefix': os.path.join(url.path.lstrip('/'), os.environ.get('TRAINING_JOB_NAME', ''),
                                   'output', 'intermediate'),
    }

    # Add intermediate folder to the watch list
    inotify = inotify_simple.INotify()
    watch_flags = inotify_simple.flags.CLOSE_WRITE | inotify_simple.flags.CREATE
    watchers = {}
    wd = inotify.add_watch(intermediate_path, watch_flags)
    watchers[wd] = ''

    # start subprocess to sync any files from intermediate folder to s3
    p = multiprocessing.Process(target=_watch, args=[inotify, watchers, watch_flags, s3_uploader])
    # Make the process daemonic as a safety switch to prevent training job from hanging forever
    # in case if something goes wrong and main container process exits in an unexpected way
    p.daemon = True
    p.start()
    return p
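A hedged usage sketch. The module-level names used above (intermediate_path, tmp_dir_path, logger, and the _watch worker) are assumed to be defined elsewhere in the same module; the S3 URI and region below are illustrative:

sync_process = start_sync("s3://my-training-bucket/output",  # hypothetical output location
                          region="us-west-2")
if sync_process is not None:
    # Training would run here; the daemonic process keeps syncing the
    # intermediate output folder to S3 in the background.
    pass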
Example #4
def read_aws_boto(client, bucket_name, bucket_path, dest_file):
    """
    read an s3 resource via normal boto interface mutliparted   
    """
    config = tfr.TransferConfig(
        multipart_threshold=2 * 1024 * 1024,
        max_concurrency=10,
        num_download_attempts=10,
    )
    transfer = tfr.S3Transfer(client, config)
    transfer.download_file(bucket_name, bucket_path, dest_file)
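Here tfr is assumed to be an import alias for the transfer module (for example, from boto3.s3 import transfer as tfr). A hedged call with illustrative arguments:

import boto3

read_aws_boto(boto3.client("s3"), "my-bucket", "path/to/object.bin", "/tmp/object.bin")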
Example #5
def write_aws_boto(client, bucket_name, bucket_path, source_file):
    """
    write to aws via normal boto interface multiparted
    """
    config = tfr.TransferConfig(
        multipart_threshold=2 * 1024 * 1024,
        max_concurrency=10,
        num_download_attempts=10,
    )
    transfer = tfr.S3Transfer(client, config)
    transfer.upload_file(source_file, bucket_name, bucket_path, callback=ProgressPercentage(source_file))
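The ProgressPercentage callback referenced above is not part of the snippet; a minimal sketch, modeled on the common callback pattern from the boto3 documentation (the output formatting is illustrative):

import os
import sys
import threading

class ProgressPercentage:
    def __init__(self, filename):
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        # Called by S3Transfer from worker threads with the number of bytes
        # transferred since the last call, hence the lock.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (self._seen_so_far / self._size) * 100
            sys.stdout.write("\r%s  %s / %s  (%.2f%%)" % (
                self._filename, self._seen_so_far, self._size, percentage))
            sys.stdout.flush()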
Example #6
    def _upload_to_s3(self):
        """Create the backup as a file on AmazonS3

        :param argparse.namespace args: The parsed CLI arguments
        :param file handle: The BytesIO handle for the backup

        """
        parsed = parse.urlparse(self.s3uri, 's3')
        key = datetime.date.today().strftime(
            '/%Y/%m/%d/{}.{}'.format(self.args.table, self.extension))
        LOGGER.info('Uploading to s3://%s%s', parsed.netloc, key)
        client = boto3.client('s3')
        s3 = transfer.S3Transfer(client)
        s3.upload_file(self.handle.name, parsed.netloc, key)
Example #7
    def get_s3_transfer(self):
        logger.info("Init s3 transfer {url}".format(url=self.endpoint_url))
        s3_config = transfer.TransferConfig(multipart_threshold=10 * TB,
                                            max_concurrency=10,
                                            multipart_chunksize=1 * TB,
                                            num_download_attempts=5,
                                            max_io_queue=100,
                                            io_chunksize=256 * KB,
                                            use_threads=True)
        try:
            s3_transfer = transfer.S3Transfer(self.s3_client, s3_config)
            return s3_transfer
        except Exception as e:
            raise Exception(e)
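The TB and KB constants used in the TransferConfig above are assumed to be defined or imported elsewhere in the module; a minimal sketch:

KB = 1024
MB = 1024 * KB
GB = 1024 * MB
TB = 1024 * GB

Note that with multipart_threshold set to 10 * TB, objects smaller than 10 TB are transferred in a single request, so multipart transfers are effectively disabled for typical object sizes.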
Example #8
def zip_and_load(file, load_packages='N'):
    """
    create deployment package and send it to s3
    """
    os.chdir('lambda')
    basename = os.path.basename(file)

    file_out = os.path.splitext(basename)[0] + '.zip'
    file_base = os.path.splitext(basename)[0]

    if load_packages == 'Y':
        arc = shutil.make_archive(base_name=file_base,
                                  format='zip',
                                  root_dir=package_loc)
        # print(arc)

    with zipfile.ZipFile(file_out, mode='a') as z:
        z.write(basename)

    client = boto3.client('s3', 'eu-west-1')
    transfer = tr.S3Transfer(client)
    transfer.upload_file(file_out, bucket_name, os.path.basename(file_out))
    os.remove(file_out)
    os.chdir('..')
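The module-level names bucket_name and package_loc referenced above are assumed to be configured elsewhere in the script; a hedged sketch:

bucket_name = "my-deployment-bucket"   # hypothetical target bucket
package_loc = "package"                # hypothetical directory holding installed dependencies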
Example #9
def start_sync(s3_output_location, region, endpoint_url=None):  # pylint: disable=inconsistent-return-statements
    """Start intermediate folder sync, which copies files from 'opt/ml/output/intermediate'
    directory to the provided s3 output location as files created or modified.
    If files are deleted, it doesn't delete them from s3.

    It starts intermediate folder behavior as a daemonic process only if the directory
    doesn't exists yet. If the directory does exist, it indicates that the platform is
    taking care of syncing files to S3 and the container should not interfere.

    Args:
        s3_output_location (str): Name of the script or module.
        region (str): The location of the module.
        endpoint_url (str): An alternative endpoint URL to connect to.

    Returns:
        (multiprocessing.Process): The intermediate output sync daemonic process.
    """
    if not s3_output_location or os.path.exists(intermediate_path):
        logger.debug("Could not initialize intermediate folder sync to s3.")
        return None

    # create intermediate and intermediate_tmp directories
    os.makedirs(intermediate_path)
    os.makedirs(tmp_dir_path)

    # configure unique s3 output location similar to how SageMaker platform does it
    # or link it to the local output directory
    url = urlparse(s3_output_location)
    if url.scheme == "file":
        logger.debug(
            "Local directory is used for output. No need to sync any intermediate output."
        )
        return None
    elif url.scheme != "s3":
        raise ValueError("Expecting 's3' scheme, got: %s in %s" %
                         (url.scheme, url))

    # create s3 transfer client
    client = boto3.client("s3", region, endpoint_url=endpoint_url)
    s3_transfer = s3transfer.S3Transfer(client)
    s3_uploader = {
        "transfer": s3_transfer,
        "bucket": url.netloc,
        "key_prefix": os.path.join(url.path.lstrip("/"),
                                   os.environ.get("TRAINING_JOB_NAME", ""),
                                   "output", "intermediate"),
    }

    # Add intermediate folder to the watch list
    inotify = inotify_simple.INotify()
    watch_flags = inotify_simple.flags.CLOSE_WRITE | inotify_simple.flags.CREATE
    watchers = {}
    wd = inotify.add_watch(intermediate_path, watch_flags)
    watchers[wd] = ""
    # start subprocess to sync any files from intermediate folder to s3
    p = multiprocessing.Process(
        target=_watch, args=[inotify, watchers, watch_flags, s3_uploader])
    # Make the process daemonic as a safety switch to prevent training job from hanging forever
    # in case if something goes wrong and main container process exits in an unexpected way
    p.daemon = True
    p.start()
    return p
Example #10
def s3_upload(file, bucket):
    client = boto3.client('s3', 'eu-west-1')
    transfer = tr.S3Transfer(client)
    transfer.upload_file(file, bucket, file)
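A hedged usage note: the same value is passed as both the local file path and the object key, so the key mirrors the local relative path. Illustrative call:

s3_upload("data/report.csv", "my-bucket")   # hypothetical; uploads to s3://my-bucket/data/report.csv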
Example #11
    def download_file(self, filename: str, key: str) -> None:

        s3t = transfer.S3Transfer(self.client)
        s3t.download_file(self.bucket, key, filename)
        LOGGER.info('File successfully downloaded!')
Example #12
    def upload_file(self, filename: str, key: str) -> None:

        s3t = transfer.S3Transfer(self.client)
        s3t.upload_file(filename, self.bucket, key)
        LOGGER.info('File successfully uploaded!')
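Examples #11 and #12 are methods of a small wrapper object that is assumed to expose self.client and self.bucket; a hedged sketch of such an enclosing class (the class name and constructor are illustrative, not taken from the original source):

import logging

import boto3
from boto3.s3 import transfer

LOGGER = logging.getLogger(__name__)


class S3FileStore:
    def __init__(self, bucket: str, region_name: str = "us-east-1"):
        self.bucket = bucket
        self.client = boto3.client("s3", region_name=region_name)

    # The download_file and upload_file methods shown in Examples #11 and #12
    # would be defined here.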