Example #1
0
    def upload_file(self, body,
                    key=None,
                    metadata=None,
                    headers=None,
                    access_key=None,
                    secret_key=None,
                    queue_derive=None,
                    verbose=None,
                    verify=None,
                    checksum=None,
                    delete=None,
                    retries=None,
                    retries_sleep=None,
                    debug=None,
                    request_kwargs=None):
        """Upload a single file to an item. The item will be created
        if it does not exist.

        :type body: Filepath or file-like object.
        :param body: File or data to be uploaded. A filepath is opened by
                     this method and closed again before it returns, except
                     in ``debug`` mode where the returned request still
                     references the open file.

        :type key: str
        :param key: (optional) Remote filename. Defaults to the last path
                    component of ``body.name``.

        :type metadata: dict
        :param metadata: (optional) Metadata used to create a new item.

        :type headers: dict
        :param headers: (optional) Add additional IA-S3 headers to request.
                        The dict is copied; the caller's object is not
                        modified.

        :type access_key: str
        :param access_key: (optional) IA-S3 access key. Defaults to
                           ``self.session.access_key``.

        :type secret_key: str
        :param secret_key: (optional) IA-S3 secret key. Defaults to
                           ``self.session.secret_key``.

        :type queue_derive: bool
        :param queue_derive: (optional) Set to False to prevent an item from
                             being derived after upload.

        :type verify: bool
        :param verify: (optional) Verify local MD5 checksum matches the MD5
                       checksum of the file received by IAS3.

        :type checksum: bool
        :param checksum: (optional) Skip the upload when the item already
                         holds a file with the same key and MD5 and
                         ``self.tasks`` is empty. Defaults to True and is
                         forced to True when ``delete`` is set.

        :type delete: bool
        :param delete: (optional) Delete local file after the upload has been
                       successfully verified.

        :type retries: int
        :param retries: (optional) Number of times to retry the given request
                        if S3 returns a 503 SlowDown error.

        :type retries_sleep: int
        :param retries_sleep: (optional) Amount of time to sleep between
                              ``retries``.

        :type verbose: bool
        :param verbose: (optional) Print progress to stdout.

        :type debug: bool
        :param debug: (optional) Set to True to build and return the request
                      object without sending the upload request.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Extra keyword arguments passed to
                               ``self.session.send``.

        :returns: The response returned by ``self.session.send``; an empty
                  ``Response`` object if the upload was skipped because the
                  checksums match; or the unsent request object if ``debug``
                  is True.

        :raises HTTPError: If the final response has an error status. The
                           exception is re-raised with a message taken from
                           the S3 error body, keeping the original response
                           and request.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> r = item.upload_file('/path/to/image.jpg',
            ...                      key='photos/image1.jpg')
            >>> r.status_code
            200
        """
        # Set defaults.
        # Work on a copy of ``headers``: entries are added below and must not
        # leak into a dict the caller may reuse for another upload.
        headers = {} if headers is None else headers.copy()
        metadata = {} if metadata is None else metadata
        access_key = self.session.access_key if access_key is None else access_key
        secret_key = self.session.secret_key if secret_key is None else secret_key
        queue_derive = True if queue_derive is None else queue_derive
        verbose = False if verbose is None else verbose
        verify = True if verify is None else verify
        delete = False if delete is None else delete
        # Set checksum after delete.
        checksum = True if delete or checksum is None else checksum
        retries = 0 if retries is None else retries
        retries_sleep = 30 if retries_sleep is None else retries_sleep
        debug = False if debug is None else debug
        request_kwargs = {} if request_kwargs is None else request_kwargs
        md5_sum = None

        # Track ownership of the file handle so that it is only closed on
        # exit when this method opened it.
        opened_here = not hasattr(body, 'read')
        if opened_here:
            body = open(body, 'rb')

        try:
            size = get_file_size(body)

            # Store the hint as a string like any other header value, and
            # skip it if the size could not be determined.
            if not headers.get('x-archive-size-hint') and size is not None:
                headers['x-archive-size-hint'] = str(size)

            # Build IA-S3 URL.
            key = body.name.split('/')[-1] if key is None else key
            base_url = '{0.session.protocol}//s3.us.archive.org/{0.identifier}'.format(
                self)
            url = '{0}/{1}'.format(
                base_url, urllib.parse.quote(key.lstrip('/').encode('utf-8')))

            # Skip based on checksum.
            if checksum:
                md5_sum = get_md5(body)
                ia_file = self.get_file(key)
                if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
                    log.info('{f} already exists: {u}'.format(f=key, u=url))
                    if verbose:
                        print(' {f} already exists, skipping.'.format(f=key))
                    if delete:
                        log.info(
                            '{f} successfully uploaded to '
                            'https://archive.org/download/{i}/{f} '
                            'and verified, deleting '
                            'local copy'.format(i=self.identifier,
                                                f=key))
                        # Release the handle first; an open handle can block
                        # deletion on some platforms.
                        body.close()
                        os.remove(body.name)
                    # Return an empty response object if checksums match.
                    # TODO: Is there a better way to handle this?
                    return Response()

            # require the Content-MD5 header when delete is True.
            if verify or delete:
                if not md5_sum:
                    md5_sum = get_md5(body)
                headers['Content-MD5'] = md5_sum

            def _build_request():
                # Rewind so every (re)try sends the file from the start.
                body.seek(0, os.SEEK_SET)
                if verbose:
                    try:
                        chunk_size = 1048576
                        # Floor division keeps the chunk count an integer.
                        expected_size = size // chunk_size + 1
                        chunks = chunk_generator(body, chunk_size)
                        progress_generator = progress.bar(
                            chunks,
                            expected_size=expected_size,
                            label=' uploading {f}: '.format(f=key))
                        data = IterableToFileAdapter(progress_generator, size)
                    except Exception:
                        # Best effort: fall back to a plain upload without a
                        # progress bar, but let KeyboardInterrupt through.
                        print(' uploading {f}'.format(f=key))
                        data = body
                else:
                    data = body

                request = S3Request(method='PUT',
                                    url=url,
                                    headers=headers,
                                    data=data,
                                    metadata=metadata,
                                    access_key=access_key,
                                    secret_key=secret_key,
                                    queue_derive=queue_derive)
                return request

            if debug:
                return _build_request()

            # Formatted on each use so the retry count in the message is
            # the current one rather than the initial value.
            overloaded_msg = ('s3 is overloaded, sleeping for '
                              '{0} seconds and retrying. '
                              '{1} retries left.')
            try:
                while True:
                    if retries > 0:
                        if self.session.s3_is_overloaded(access_key):
                            error_msg = overloaded_msg.format(retries_sleep, retries)
                            sleep(retries_sleep)
                            log.info(error_msg)
                            if verbose:
                                print(' warning: {0}'.format(error_msg),
                                      file=sys.stderr)
                            retries -= 1
                            continue
                    request = _build_request()
                    prepared_request = request.prepare()
                    response = self.session.send(prepared_request,
                                                 stream=True,
                                                 **request_kwargs)
                    if (response.status_code == 503) and (retries > 0):
                        error_msg = overloaded_msg.format(retries_sleep, retries)
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg),
                                  file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    else:
                        if response.status_code == 503:
                            log.info('maximum retries exceeded, upload failed.')
                        break
                response.raise_for_status()
                log.info('uploaded {f} to {u}'.format(f=key, u=url))
                if delete and response.status_code == 200:
                    log.info(
                        '{f} successfully uploaded to '
                        'https://archive.org/download/{i}/{f} and verified, deleting '
                        'local copy'.format(i=self.identifier,
                                            f=key))
                    body.close()
                    os.remove(body.name)
                return response
            except HTTPError as exc:
                msg = get_s3_xml_text(exc.response.content)
                error_msg = (' error uploading {0} to {1}, '
                             '{2}'.format(key, self.identifier, msg))
                log.error(error_msg)
                if verbose:
                    print(' error uploading {0}: {1}'.format(key, msg), file=sys.stderr)
                # Raise HTTPError with error message.
                raise type(exc)(error_msg, response=exc.response, request=exc.request)
        finally:
            # In debug mode the returned request still reads from ``body``,
            # so the handle is left open for the caller.
            if opened_here and not debug:
                body.close()
Example #2
0
    def upload_file(self,
                    body,
                    key=None,
                    metadata=None,
                    headers=None,
                    access_key=None,
                    secret_key=None,
                    queue_derive=None,
                    verbose=None,
                    verify=None,
                    checksum=None,
                    delete=None,
                    retries=None,
                    retries_sleep=None,
                    debug=None,
                    request_kwargs=None):
        """Upload a single file to an item. The item will be created
        if it does not exist.

        :type body: Filepath or file-like object.
        :param body: File or data to be uploaded. A filepath is opened by
                     this method and closed again before it returns, except
                     in ``debug`` mode where the returned request still
                     references the open file.

        :type key: str
        :param key: (optional) Remote filename. Defaults to the last path
                    component of ``body.name``.

        :type metadata: dict
        :param metadata: (optional) Metadata used to create a new item. A
                         ``scanner`` entry is added to a copy of this dict
                         if none is given.

        :type headers: dict
        :param headers: (optional) Add additional IA-S3 headers to request.
                        The dict is copied; the caller's object is not
                        modified.

        :type access_key: str
        :param access_key: (optional) IA-S3 access key. Defaults to
                           ``self.session.access_key``.

        :type secret_key: str
        :param secret_key: (optional) IA-S3 secret key. Defaults to
                           ``self.session.secret_key``.

        :type queue_derive: bool
        :param queue_derive: (optional) Set to False to prevent an item from
                             being derived after upload.

        :type verify: bool
        :param verify: (optional) Verify local MD5 checksum matches the MD5
                       checksum of the file received by IAS3.

        :type checksum: bool
        :param checksum: (optional) Skip the upload when the item already
                         holds a file with the same key and MD5 and
                         ``self.tasks`` is empty. Defaults to True and is
                         forced to True when ``delete`` is set.

        :type delete: bool
        :param delete: (optional) Delete local file after the upload has been
                       successfully verified.

        :type retries: int
        :param retries: (optional) Number of times to retry the given request
                        if S3 returns a 503 SlowDown error.

        :type retries_sleep: int
        :param retries_sleep: (optional) Amount of time to sleep between
                              ``retries``.

        :type verbose: bool
        :param verbose: (optional) Print progress to stdout.

        :type debug: bool
        :param debug: (optional) Set to True to build and return the request
                      object without sending the upload request.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Extra keyword arguments passed to
                               ``self.session.send``.

        :returns: The response returned by ``self.session.send``; an empty
                  ``Response`` object if the upload was skipped because the
                  checksums match; or the unsent request object if ``debug``
                  is True.

        :raises HTTPError: If the final response has an error status. The
                           exception is re-raised with a descriptive message,
                           keeping the original response and request.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> r = item.upload_file('/path/to/image.jpg',
            ...                      key='photos/image1.jpg')
            >>> r.status_code
            200
        """
        # Set defaults.
        # Work on copies of ``headers`` and ``metadata``: entries are added
        # below and must not leak into dicts the caller may reuse.
        headers = {} if headers is None else headers.copy()
        metadata = {} if metadata is None else metadata.copy()
        access_key = self.session.access_key if access_key is None else access_key
        secret_key = self.session.secret_key if secret_key is None else secret_key
        queue_derive = True if queue_derive is None else queue_derive
        verbose = False if verbose is None else verbose
        verify = True if verify is None else verify
        delete = False if delete is None else delete
        # Set checksum after delete.
        checksum = True if delete or checksum is None else checksum
        retries = 0 if retries is None else retries
        retries_sleep = 30 if retries_sleep is None else retries_sleep
        debug = False if debug is None else debug
        request_kwargs = {} if request_kwargs is None else request_kwargs

        # Track ownership of the file handle so that it is only closed on
        # exit when this method opened it.
        opened_here = not hasattr(body, 'read')
        if opened_here:
            body = open(body, 'rb')

        try:
            if not metadata.get('scanner'):
                scanner = 'Internet Archive Python library {0}'.format(__version__)
                metadata['scanner'] = scanner

            # Measure the body by seeking to its end; unseekable bodies have
            # no known size.
            try:
                body.seek(0, os.SEEK_END)
                size = body.tell()
                body.seek(0, os.SEEK_SET)
            except IOError:
                size = None

            # Store the hint as a string like any other header value, and
            # skip it when the size is unknown.
            if not headers.get('x-archive-size-hint') and size is not None:
                headers['x-archive-size-hint'] = str(size)

            key = body.name.split('/')[-1] if key is None else key
            base_url = '{protocol}//s3.us.archive.org/{identifier}'.format(
                protocol=self.session.protocol, identifier=self.identifier)
            url = '{base_url}/{key}'.format(base_url=base_url, key=key.lstrip('/'))

            # Skip based on checksum.
            md5_sum = get_md5(body)
            ia_file = self.get_file(key)
            if (checksum) and (not self.tasks) and (ia_file) and (ia_file.md5
                                                                  == md5_sum):
                log.info('{f} already exists: {u}'.format(f=key, u=url))
                if verbose:
                    print(' {f} already exists, skipping.'.format(f=key))
                if delete:
                    log.info(
                        '{f} successfully uploaded to https://archive.org/download/{i}/{f} '
                        'and verified, deleting '
                        'local copy'.format(i=self.identifier, f=key))
                    # Release the handle first; an open handle can block
                    # deletion on some platforms.
                    body.close()
                    os.remove(body.name)
                # Return an empty response object if checksums match.
                # TODO: Is there a better way to handle this?
                return Response()

            # require the Content-MD5 header when delete is True.
            if verify or delete:
                headers['Content-MD5'] = md5_sum

            def _build_request():
                # Rewind so every (re)try sends the file from the start.
                body.seek(0, os.SEEK_SET)
                if verbose:
                    try:
                        chunk_size = 1048576
                        # Floor division keeps the chunk count an integer.
                        expected_size = size // chunk_size + 1
                        chunks = chunk_generator(body, chunk_size)
                        progress_generator = progress.bar(
                            chunks,
                            expected_size=expected_size,
                            label=' uploading {f}: '.format(f=key))
                        data = IterableToFileAdapter(progress_generator, size)
                    except Exception:
                        # Best effort: fall back to a plain upload without a
                        # progress bar, but let KeyboardInterrupt through.
                        print(' uploading {f}'.format(f=key))
                        data = body
                else:
                    data = body

                request = S3Request(method='PUT',
                                    url=url,
                                    headers=headers,
                                    data=data,
                                    metadata=metadata,
                                    access_key=access_key,
                                    secret_key=secret_key,
                                    queue_derive=queue_derive)
                return request

            if debug:
                return _build_request()

            # Formatted on each use so the retry count in the message is
            # the current one rather than the initial value.
            overloaded_msg = ('s3 is overloaded, sleeping for '
                              '{0} seconds and retrying. '
                              '{1} retries left.')
            try:
                while True:
                    if retries > 0:
                        if self.session.s3_is_overloaded(access_key):
                            error_msg = overloaded_msg.format(retries_sleep, retries)
                            sleep(retries_sleep)
                            log.info(error_msg)
                            if verbose:
                                print(' warning: {0}'.format(error_msg),
                                      file=sys.stderr)
                            retries -= 1
                            continue
                    request = _build_request()
                    prepared_request = request.prepare()
                    response = self.session.send(prepared_request,
                                                 stream=True,
                                                 **request_kwargs)
                    if (response.status_code == 503) and (retries > 0):
                        error_msg = overloaded_msg.format(retries_sleep, retries)
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg),
                                  file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    else:
                        if response.status_code == 503:
                            log.info(
                                'maximum retries exceeded, upload failed.')
                        break
                response.raise_for_status()
                log.info('uploaded {f} to {u}'.format(f=key, u=url))
                if delete and response.status_code == 200:
                    log.info(
                        '{f} successfully uploaded to '
                        'https://archive.org/download/{i}/{f} and verified, deleting '
                        'local copy'.format(i=self.identifier, f=key))
                    body.close()
                    os.remove(body.name)
                return response
            except HTTPError as exc:
                error_msg = (' error uploading {0} to {1}, '
                             '{2}'.format(key, self.identifier, exc))
                log.error(error_msg)
                if verbose:
                    print(error_msg, file=sys.stderr)
                # Raise HTTPError with error message, keeping the original
                # response and request available to the caller.
                raise type(exc)(error_msg, response=exc.response, request=exc.request)
        finally:
            # In debug mode the returned request still reads from ``body``,
            # so the handle is left open for the caller.
            if opened_here and not debug:
                body.close()
Example #3
0
    def upload_file(self, body,
                    key=None,
                    metadata=None,
                    headers=None,
                    access_key=None,
                    secret_key=None,
                    queue_derive=None,
                    verbose=None,
                    verify=None,
                    checksum=None,
                    delete=None,
                    retries=None,
                    retries_sleep=None,
                    debug=None,
                    request_kwargs=None):
        """Upload a single file to an item. The item will be created
        if it does not exist.

        :type body: Filepath or file-like object.
        :param body: File or data to be uploaded. The file object (whether
                     opened here from a filepath or supplied by the caller)
                     is closed before this method returns or raises.

        :type key: str
        :param key: (optional) Remote filename. Defaults to the last path
                    component of the local filename.

        :type metadata: dict
        :param metadata: (optional) Metadata used to create a new item.

        :type headers: dict
        :param headers: (optional) Add additional IA-S3 headers to request.
                        The dict is copied; the caller's object is not
                        modified.

        :type access_key: str
        :param access_key: (optional) IA-S3 access key. Defaults to
                           ``self.session.access_key``.

        :type secret_key: str
        :param secret_key: (optional) IA-S3 secret key. Defaults to
                           ``self.session.secret_key``.

        :type queue_derive: bool
        :param queue_derive: (optional) Set to False to prevent an item from
                             being derived after upload.

        :type verify: bool
        :param verify: (optional) Verify local MD5 checksum matches the MD5
                       checksum of the file received by IAS3.

        :type checksum: bool
        :param checksum: (optional) Skip the upload when the item already
                         holds a file with the same key and MD5 and
                         ``self.tasks`` is empty. Forced to True when
                         ``delete`` is set.

        :type delete: bool
        :param delete: (optional) Delete local file after the upload has been
                       successfully verified. The file deleted is the path
                       given as ``body`` or, for a file object, its ``name``;
                       a file object without a ``name`` has nothing to delete.

        :type retries: int
        :param retries: (optional) Number of times to retry the given request
                        if S3 returns a 503 SlowDown error.

        :type retries_sleep: int
        :param retries_sleep: (optional) Amount of time to sleep between
                              ``retries``.

        :type verbose: bool
        :param verbose: (optional) Print progress to stdout.

        :type debug: bool
        :param debug: (optional) Set to True to return the prepared request
                      without sending the upload request.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Extra keyword arguments passed to
                               ``self.session.send``. ``timeout`` defaults to
                               120 if not given. The dict is copied.

        :returns: The response returned by ``self.session.send``; an empty
                  ``Response`` object if the upload was skipped because the
                  checksums match; or the prepared request if ``debug`` is
                  True.

        :raises HTTPError: If the final response has an error status. The
                           exception is re-raised with a message taken from
                           the S3 error body, keeping the original response
                           and request.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> r = item.upload_file('/path/to/image.jpg',
            ...                      key='photos/image1.jpg')
            >>> r.status_code
            200
        """
        # Set defaults.
        # Work on copies of ``headers`` and ``request_kwargs``: entries are
        # added below and must not leak into dicts the caller may reuse.
        headers = {} if headers is None else headers.copy()
        metadata = {} if metadata is None else metadata
        access_key = self.session.access_key if access_key is None else access_key
        secret_key = self.session.secret_key if secret_key is None else secret_key
        queue_derive = True if queue_derive is None else queue_derive
        verbose = False if verbose is None else verbose
        verify = True if verify is None else verify
        delete = False if delete is None else delete
        # Set checksum after delete.
        checksum = True if delete else checksum
        retries = 0 if retries is None else retries
        retries_sleep = 30 if retries_sleep is None else retries_sleep
        debug = False if debug is None else debug
        request_kwargs = {} if request_kwargs is None else request_kwargs.copy()
        if 'timeout' not in request_kwargs:
            request_kwargs['timeout'] = 120
        md5_sum = None

        # ``filename`` is only used to derive the remote key; ``local_path``
        # is the file on disk that ``delete`` may remove. They differ when a
        # file object is passed together with an explicit ``key``: the remote
        # key must never be treated as a local path.
        if not hasattr(body, 'read'):
            filename = body
            local_path = body
            body = open(body, 'rb')
        else:
            if key:
                filename = key
            else:
                filename = body.name
            local_path = getattr(body, 'name', None)

        try:
            size = get_file_size(body)

            # Support for uploading empty files.
            if size == 0:
                headers['Content-Length'] = '0'

            if not headers.get('x-archive-size-hint'):
                headers['x-archive-size-hint'] = str(size)

            # Build IA-S3 URL.
            key = norm_filepath(filename).split('/')[-1] if key is None else key
            base_url = '{0.session.protocol}//s3.us.archive.org/{0.identifier}'.format(
                self)
            url = '{0}/{1}'.format(
                base_url,
                urllib.parse.quote(norm_filepath(key).lstrip('/').encode('utf-8')))

            # Skip based on checksum.
            if checksum:
                md5_sum = get_md5(body)
                ia_file = self.get_file(key)
                if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
                    log.info('{f} already exists: {u}'.format(f=key, u=url))
                    if verbose:
                        print(' {f} already exists, skipping.'.format(f=key))
                    if delete:
                        # Release the handle before removing the file.
                        body.close()
                        if local_path is not None:
                            log.info(
                                '{f} successfully uploaded to '
                                'https://archive.org/download/{i}/{f} '
                                'and verified, deleting '
                                'local copy'.format(i=self.identifier,
                                                    f=key))
                            os.remove(local_path)
                    # Return an empty response object if checksums match.
                    # TODO: Is there a better way to handle this?
                    return Response()

            # require the Content-MD5 header when delete is True.
            if verify or delete:
                if not md5_sum:
                    md5_sum = get_md5(body)
                headers['Content-MD5'] = md5_sum

            def _build_request():
                # Rewind so every (re)try sends the file from the start.
                body.seek(0, os.SEEK_SET)
                data = body
                if verbose:
                    # Empty files get the plain message instead of a bar.
                    plain_output = (size == 0)
                    if not plain_output:
                        try:
                            chunk_size = 1048576
                            # Floor division keeps the chunk count an integer.
                            expected_size = size // chunk_size + 1
                            chunks = chunk_generator(body, chunk_size)
                            progress_generator = progress.bar(
                                chunks,
                                expected_size=expected_size,
                                label=' uploading {f}: '.format(f=key))
                            data = IterableToFileAdapter(progress_generator, size)
                        except Exception:
                            # Best effort: fall back to a plain upload without
                            # a progress bar, but let KeyboardInterrupt through.
                            plain_output = True
                    if plain_output:
                        print(' uploading {f}'.format(f=key))

                headers.update(self.session.headers)
                request = S3Request(method='PUT',
                                    url=url,
                                    headers=headers,
                                    data=data,
                                    metadata=metadata,
                                    access_key=access_key,
                                    secret_key=secret_key,
                                    queue_derive=queue_derive)
                return request

            if debug:
                return self.session.prepare_request(_build_request())

            # Formatted on each use so the retry count in the message is
            # the current one rather than the initial value.
            overloaded_msg = ('s3 is overloaded, sleeping for '
                              '{0} seconds and retrying. '
                              '{1} retries left.')
            try:
                while True:
                    if retries > 0:
                        if self.session.s3_is_overloaded(access_key):
                            error_msg = overloaded_msg.format(retries_sleep, retries)
                            sleep(retries_sleep)
                            log.info(error_msg)
                            if verbose:
                                print(' warning: {0}'.format(error_msg),
                                      file=sys.stderr)
                            retries -= 1
                            continue
                    request = _build_request()
                    prepared_request = request.prepare()

                    # chunked transfer-encoding is NOT supported by IA-S3.
                    # It should NEVER be set. Requests adds it in certain
                    # scenarios (e.g. if content-length is 0). Stop it.
                    if prepared_request.headers.get('transfer-encoding') == 'chunked':
                        del prepared_request.headers['transfer-encoding']

                    response = self.session.send(prepared_request,
                                                 stream=True,
                                                 **request_kwargs)
                    if (response.status_code == 503) and (retries > 0):
                        error_msg = overloaded_msg.format(retries_sleep, retries)
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg),
                                  file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    else:
                        if response.status_code == 503:
                            log.info('maximum retries exceeded, upload failed.')
                        break
                response.raise_for_status()
                log.info(u'uploaded {f} to {u}'.format(f=key, u=url))
                if delete and response.status_code == 200:
                    # Release the handle before removing the file.
                    body.close()
                    if local_path is not None:
                        log.info(
                            '{f} successfully uploaded to '
                            'https://archive.org/download/{i}/{f} and verified, deleting '
                            'local copy'.format(i=self.identifier, f=key))
                        os.remove(local_path)
                return response
            except HTTPError as exc:
                msg = get_s3_xml_text(exc.response.content)
                error_msg = (' error uploading {0} to {1}, '
                             '{2}'.format(key, self.identifier, msg))
                log.error(error_msg)
                if verbose:
                    print(' error uploading {0}: {1}'.format(key, msg), file=sys.stderr)
                # Raise HTTPError with error message.
                raise type(exc)(error_msg, response=exc.response, request=exc.request)
        finally:
            # Close on every exit path, including unexpected exceptions such
            # as connection errors or timeouts.
            body.close()
Example #4
0
    def upload_file(
        self,
        body,
        key=None,
        metadata=None,
        headers=None,
        access_key=None,
        secret_key=None,
        queue_derive=None,
        verbose=None,
        verify=None,
        checksum=None,
        delete=None,
        retries=None,
        retries_sleep=None,
        debug=None,
        request_kwargs=None,
    ):
        """Upload a single file to an item. The item will be created
        if it does not exist.

        :type body: Filepath or file-like object.
        :param body: File or data to be uploaded.

        :type key: str
        :param key: (optional) Remote filename.

        :type metadata: dict
        :param metadata: (optional) Metadata used to create a new item.

        :type headers: dict
        :param headers: (optional) Add additional IA-S3 headers to request.

        :type queue_derive: bool
        :param queue_derive: (optional) Set to False to prevent an item from
                             being derived after upload.

        :type verify: bool
        :param verify: (optional) Verify local MD5 checksum matches the MD5
                       checksum of the file received by IAS3.

        :type checksum: bool
        :param checksum: (optional) Skip based on checksum.

        :type delete: bool
        :param delete: (optional) Delete local file after the upload has been
                       successfully verified.

        :type retries: int
        :param retries: (optional) Number of times to retry the given request
                        if S3 returns a 503 SlowDown error.

        :type retries_sleep: int
        :param retries_sleep: (optional) Amount of time to sleep between
                              ``retries``.

        :type verbose: bool
        :param verbose: (optional) Print progress to stdout.

        :type debug: bool
        :param debug: (optional) Set to True to print headers to stdout, and
                      exit without sending the upload request.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> item.upload_file('/path/to/image.jpg',
            ...                  key='photos/image1.jpg')
            True
        """
        # Set defaults.
        headers = {} if headers is None else headers
        metadata = {} if metadata is None else metadata
        access_key = self.session.access_key if access_key is None else access_key
        secret_key = self.session.secret_key if secret_key is None else secret_key
        queue_derive = True if queue_derive is None else queue_derive
        verbose = False if verbose is None else verbose
        verify = True if verify is None else verify
        delete = False if delete is None else delete
        # Set checksum after delete.
        checksum = True if delete or checksum is None else checksum
        retries = 0 if retries is None else retries
        retries_sleep = 30 if retries_sleep is None else retries_sleep
        debug = False if debug is None else debug
        request_kwargs = {} if request_kwargs is None else request_kwargs

        if not hasattr(body, "read"):
            with open(body, "rb") as f:
                body = BytesIO(f.read())
                filename = f.name
        else:
            filename = body.name

        if not metadata.get("scanner"):
            scanner = "Internet Archive Python library {0}".format(__version__)
            metadata["scanner"] = scanner

        try:
            body.seek(0, os.SEEK_END)
            size = body.tell()
            body.seek(0, os.SEEK_SET)
        except IOError:
            size = None

        if not headers.get("x-archive-size-hint"):
            headers["x-archive-size-hint"] = size

        key = filename.split("/")[-1] if key is None else key
        base_url = "{protocol}//s3.us.archive.org/{identifier}".format(
            protocol=self.session.protocol, identifier=self.identifier
        )
        url = "{base_url}/{key}".format(base_url=base_url, key=key.lstrip("/"))

        # Skip based on checksum.
        md5_sum = get_md5(body)
        ia_file = self.get_file(key)
        if (checksum) and (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
            log.info("{f} already exists: {u}".format(f=key, u=url))
            if verbose:
                print(" {f} already exists, skipping.".format(f=key))
            if delete:
                log.info(
                    "{f} successfully uploaded to https://archive.org/download/{i}/{f} "
                    "and verified, deleting "
                    "local copy".format(i=self.identifier, f=key)
                )
                os.remove(filename)
            # Return an empty response object if checksums match.
            # TODO: Is there a better way to handle this?
            return Response()

        # require the Content-MD5 header when delete is True.
        if verify or delete:
            headers["Content-MD5"] = md5_sum

        def _build_request():
            body.seek(0, os.SEEK_SET)
            if verbose:
                try:
                    chunk_size = 1048576
                    expected_size = size / chunk_size + 1
                    chunks = chunk_generator(body, chunk_size)
                    progress_generator = progress.bar(
                        chunks, expected_size=expected_size, label=" uploading {f}: ".format(f=key)
                    )
                    data = IterableToFileAdapter(progress_generator, size)
                except:
                    print(" uploading {f}".format(f=key))
                    data = body
            else:
                data = body

            request = S3Request(
                method="PUT",
                url=url,
                headers=headers,
                data=data,
                metadata=metadata,
                access_key=access_key,
                secret_key=secret_key,
                queue_derive=queue_derive,
            )
            return request

        if debug:
            return _build_request()
        else:
            try:
                error_msg = (
                    "s3 is overloaded, sleeping for "
                    "{0} seconds and retrying. "
                    "{1} retries left.".format(retries_sleep, retries)
                )
                while True:
                    if retries > 0:
                        if self.session.s3_is_overloaded(access_key):
                            sleep(retries_sleep)
                            log.info(error_msg)
                            if verbose:
                                print(" warning: {0}".format(error_msg), file=sys.stderr)
                            retries -= 1
                            continue
                    request = _build_request()
                    prepared_request = request.prepare()
                    response = self.session.send(prepared_request, stream=True, **request_kwargs)
                    if (response.status_code == 503) and (retries > 0):
                        log.info(error_msg)
                        if verbose:
                            print(" warning: {0}".format(error_msg), file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    else:
                        if response.status_code == 503:
                            log.info("maximum retries exceeded, upload failed.")
                        break
                response.raise_for_status()
                log.info("uploaded {f} to {u}".format(f=key, u=url))
                if delete and response.status_code == 200:
                    log.info(
                        "{f} successfully uploaded to "
                        "https://archive.org/download/{i}/{f} and verified, deleting "
                        "local copy".format(i=self.identifier, f=key)
                    )
                    os.remove(filename)
                return response
            except HTTPError as exc:
                error_msg = " error uploading {0} to {1}, " "{2}".format(key, self.identifier, exc)
                log.error(error_msg)
                if verbose:
                    print(error_msg, file=sys.stderr)
                # Raise HTTPError with error message.
                raise type(exc)(error_msg)
Example #5
0
    def download(self, file_path=None, verbose=None, silent=None,
                 ignore_existing=None, checksum=None, destdir=None,
                 retries=None, ignore_errors=None, fileobj=None,
                 return_responses=None, no_change_timestamp=None,
                 params=None):
        """Fetch this file from the archive and store it locally.

        :type file_path: str
        :param file_path: (optional) Local path to write to. Defaults to the
                          file's remote name (``self.name``).

        :type verbose: bool
        :param verbose: (optional) Print full status messages to stdout.

        :type silent: bool
        :param silent: (optional) Output is suppressed unless this is
                       explicitly ``False``; when it is ``False`` and
                       ``verbose`` is falsy, one-character progress markers
                       (``.`` skipped, ``d`` downloaded, ``e`` error) are
                       printed instead of full messages.

        :type ignore_existing: bool
        :param ignore_existing: (optional) Skip the download when the
                                destination path already exists.

        :type checksum: bool
        :param checksum: (optional) When the destination exists, skip the
                         download only if its MD5 equals ``self.md5``. When
                         neither this nor ``ignore_existing`` is set, an
                         existing file is skipped based on its size and
                         modification time instead.

        :type destdir: str
        :param destdir: (optional) Directory that ``file_path`` is joined
                        onto. Created if missing (unless ``return_responses``
                        is ``True``).

        :type retries: int
        :param retries: (optional) ``max_retries`` handed to the session's
                        HTTP adapter. Defaults to 2.

        :type ignore_errors: bool
        :param ignore_errors: (optional) When exactly ``True``, return
                              ``False`` on a failed download instead of
                              re-raising the exception.

        :type fileobj: file-like object
        :param fileobj: (optional) Write the data to this object (e.g.
                        sys.stdout) instead of opening ``file_path``. It is
                        used as a context manager, so it is closed afterwards.

        :type return_responses: bool
        :param return_responses: (optional) Return the response object rather
                                 than writing anything to disk.

        :type no_change_timestamp: bool
        :param no_change_timestamp: (optional) If True, do not set the local
                                    file's mtime to ``self.mtime``.

        :type params: dict
        :param params: (optional) URL parameters to send with the download
                       request (e.g. `cnt=0`).

        :rtype: bool
        :returns: ``True`` once the file has been downloaded; ``False`` when
                  the download failed and ``ignore_errors`` is ``True``;
                  ``None`` when the download was skipped; the response object
                  when ``return_responses`` is set.

        :raises IOError: if ``destdir`` is an existing regular file.
        """
        # Normalise the optional arguments.
        if verbose is None:
            verbose = False
        if ignore_existing is None:
            ignore_existing = False
        if checksum is None:
            checksum = False
        retries = retries or 2
        ignore_errors = ignore_errors or False
        return_responses = return_responses or False
        no_change_timestamp = no_change_timestamp or False
        params = params or None

        # Output stays suppressed unless ``silent`` was explicitly False.
        implied_quiet = fileobj and silent is None
        silent = bool(implied_quiet or silent is not False)

        def _echo(text, marker):
            # Full message in verbose mode; otherwise a single progress
            # character, and only when ``silent`` is exactly False.
            if verbose:
                print(' ' + text)
            elif silent is False:
                print(marker, end='')
                sys.stdout.flush()

        self.item.session.mount_http_adapter(max_retries=retries)
        file_path = file_path or self.name

        if destdir:
            if not os.path.exists(destdir) and return_responses is not True:
                os.mkdir(destdir)
            if os.path.isfile(destdir):
                raise IOError('{} is not a directory!'.format(destdir))
            file_path = os.path.join(destdir, file_path)

        # Decide whether an already-present local copy lets us skip the fetch.
        if not return_responses and os.path.exists(file_path.encode('utf-8')):
            skip_msg = None
            if ignore_existing:
                skip_msg = 'skipping {0}, file already exists.'.format(file_path)
            elif checksum:
                with open(file_path, 'rb') as fp:
                    local_md5 = utils.get_md5(fp)
                if local_md5 == self.md5:
                    skip_msg = ('skipping {0}, '
                                'file already exists based on checksum.'.format(file_path))
            else:
                st = os.stat(file_path.encode('utf-8'))
                # A non-empty *_files.xml is always treated as up to date.
                if (st.st_mtime == self.mtime and st.st_size == self.size) or \
                        (self.name.endswith('_files.xml') and st.st_size != 0):
                    skip_msg = ('skipping {0}, file already exists '
                                'based on length and date.'.format(file_path))

            if skip_msg is not None:
                log.info(skip_msg)
                _echo(skip_msg, '.')
                return

        parent_dir = os.path.dirname(file_path)
        if parent_dir != '' and not os.path.exists(parent_dir):
            if return_responses is not True:
                os.makedirs(parent_dir)

        try:
            response = self.item.session.get(
                self.url, stream=True, timeout=12, auth=self.auth, params=params)
            response.raise_for_status()
            if return_responses:
                return response

            sink = fileobj or open(file_path.encode('utf-8'), 'wb')
            with sink:
                for chunk in response.iter_content(chunk_size=2048):
                    if not chunk:
                        continue
                    sink.write(chunk)
                    sink.flush()
        except (RetryError, HTTPError, ConnectTimeout, ConnectionError,
                socket.error, ReadTimeout) as exc:
            msg = ('error downloading file {0}, '
                   'exception raised: {1}'.format(file_path, exc))
            log.error(msg)
            # Do not leave a partially written file behind.
            if os.path.exists(file_path):
                os.remove(file_path)
            _echo(msg, 'e')
            if ignore_errors is True:
                return False
            raise exc

        # Set mtime with mtime from files.xml.
        if not no_change_timestamp:
            try:
                os.utime(file_path.encode('utf-8'), (0, self.mtime))
            except OSError:
                # Probably file-like object, e.g. sys.stdout.
                pass

        msg = 'downloaded {0}/{1} to {2}'.format(self.identifier, self.name, file_path)
        log.info(msg)
        _echo(msg, 'd')
        return True
Example #6
0
    def download(self, file_path=None, verbose=None, silent=None, ignore_existing=None,
                 checksum=None, destdir=None, retries=None, ignore_errors=None,
                 fileobj=None, return_responses=None, no_change_timestamp=None,
                 params=None):
        """Retrieve this file and write it to a local path or file object.

        :type file_path: str
        :param file_path: (optional) Destination path; falls back to the
                          file's remote name (``self.name``).

        :type verbose: bool
        :param verbose: (optional) Print complete status messages to stdout.

        :type silent: bool
        :param silent: (optional) Any value other than an explicit ``False``
                       suppresses output. With ``False`` (and ``verbose``
                       falsy) a single character is printed per outcome:
                       ``.`` skipped, ``d`` downloaded, ``e`` error.

        :type ignore_existing: bool
        :param ignore_existing: (optional) Do not download when the
                                destination path already exists.

        :type checksum: bool
        :param checksum: (optional) Do not download when the existing local
                         file's MD5 matches ``self.md5``. If neither this nor
                         ``ignore_existing`` is set, an existing file is
                         skipped based on size and modification time.

        :type destdir: str
        :param destdir: (optional) Directory the destination path is placed
                        in. Created if it does not exist (unless
                        ``return_responses`` is ``True``).

        :type retries: int
        :param retries: (optional) ``max_retries`` for the session's HTTP
                        adapter. Defaults to 2.

        :type ignore_errors: bool
        :param ignore_errors: (optional) When exactly ``True``, a failed
                              download returns ``False`` rather than raising.

        :type fileobj: file-like object
        :param fileobj: (optional) Target for the downloaded bytes (e.g.
                        sys.stdout). Used as a context manager and therefore
                        closed when the download ends.

        :type return_responses: bool
        :param return_responses: (optional) Hand back the response object
                                 instead of writing data.

        :type no_change_timestamp: bool
        :param no_change_timestamp: (optional) If True, leave the local
                                    file's timestamp alone instead of setting
                                    it to ``self.mtime``.

        :type params: dict
        :param params: (optional) URL parameters to send with the download
                       request (e.g. `cnt=0`).

        :rtype: bool
        :returns: ``True`` after a successful download; ``False`` if it failed
                  and ``ignore_errors`` is ``True``; ``None`` if it was
                  skipped; the response object if ``return_responses`` is set.

        :raises IOError: if ``destdir`` is an existing regular file.
        """
        verbose = verbose if verbose is not None else False
        ignore_existing = ignore_existing if ignore_existing is not None else False
        checksum = checksum if checksum is not None else False
        retries = retries if retries else 2
        ignore_errors = ignore_errors if ignore_errors else False
        return_responses = return_responses if return_responses else False
        no_change_timestamp = no_change_timestamp if no_change_timestamp else False
        params = params if params else None

        # Only an explicit ``silent=False`` turns the progress markers on.
        quiet = (fileobj and silent is None) or silent is not False
        silent = True if quiet else False

        def _notify(message, symbol):
            # Verbose wins; otherwise emit the one-character marker when allowed.
            if verbose:
                print(' ' + message)
            elif silent is False:
                print(symbol, end='')
                sys.stdout.flush()

        def _skip_reason(path):
            # Return the message explaining why ``path`` need not be fetched
            # again, or None when the download should go ahead.
            if ignore_existing:
                return 'skipping {0}, file already exists.'.format(path)
            if checksum:
                with open(path, 'rb') as fp:
                    digest = utils.get_md5(fp)
                if digest == self.md5:
                    return ('skipping {0}, '
                            'file already exists based on checksum.'.format(path))
                return None
            st = os.stat(path.encode('utf-8'))
            # A non-empty *_files.xml is always treated as current.
            if (st.st_mtime == self.mtime and st.st_size == self.size) or \
                    (self.name.endswith('_files.xml') and st.st_size != 0):
                return ('skipping {0}, file already exists '
                        'based on length and date.'.format(path))
            return None

        self.item.session.mount_http_adapter(max_retries=retries)
        if not file_path:
            file_path = self.name

        if destdir:
            if not os.path.exists(destdir) and return_responses is not True:
                os.mkdir(destdir)
            if os.path.isfile(destdir):
                raise IOError('{} is not a directory!'.format(destdir))
            file_path = os.path.join(destdir, file_path)

        if not return_responses and os.path.exists(file_path.encode('utf-8')):
            reason = _skip_reason(file_path)
            if reason is not None:
                log.info(reason)
                _notify(reason, '.')
                return

        parent_dir = os.path.dirname(file_path)
        needs_parent = parent_dir != '' and not os.path.exists(parent_dir)
        if needs_parent and return_responses is not True:
            os.makedirs(parent_dir)

        try:
            response = self.item.session.get(self.url, stream=True, timeout=12,
                                             auth=self.auth, params=params)
            response.raise_for_status()
            if return_responses:
                return response

            if fileobj:
                target = fileobj
            else:
                target = open(file_path.encode('utf-8'), 'wb')

            with target:
                for piece in response.iter_content(chunk_size=2048):
                    if piece:
                        target.write(piece)
                        target.flush()
        except (RetryError, HTTPError, ConnectTimeout,
                ConnectionError, socket.error, ReadTimeout) as exc:
            failure = ('error downloading file {0}, '
                       'exception raised: {1}'.format(file_path, exc))
            log.error(failure)
            # Remove whatever was written before the failure.
            if os.path.exists(file_path):
                os.remove(file_path)
            _notify(failure, 'e')
            if ignore_errors is True:
                return False
            raise exc

        # Set mtime with mtime from files.xml.
        if not no_change_timestamp:
            try:
                os.utime(file_path.encode('utf-8'), (0, self.mtime))
            except OSError:
                # Probably file-like object, e.g. sys.stdout.
                pass

        done = 'downloaded {0}/{1} to {2}'.format(self.identifier, self.name, file_path)
        log.info(done)
        _notify(done, 'd')
        return True
Example #7
0
    def download(self, file_path=None, verbose=None, silent=None, ignore_existing=None,
                 checksum=None, destdir=None, retries=None, ignore_errors=None):
        """Download the file into the current working directory.

        :type file_path: str
        :param file_path: (optional) Download file to the given file_path.
                          Defaults to the file's remote name (``self.name``).

        :type verbose: bool
        :param verbose: (optional) Print full status messages to stdout.

        :type silent: bool
        :param silent: (optional) When ``False`` (the default) and ``verbose``
                       is falsy, print a one-character progress marker
                       (``.`` skipped, ``d`` downloaded, ``e`` error).

        :type ignore_existing: bool
        :param ignore_existing: (optional) Skip the download if the
                                destination path already exists.

        :type checksum: bool
        :param checksum: (optional) Skip the download if the existing local
                         file's MD5 matches ``self.md5``. If neither this nor
                         ``ignore_existing`` is set, an existing file is
                         skipped based on size and modification time.

        :type destdir: str
        :param destdir: (optional) The directory to download files to. It is
                        created if it does not exist.

        :type retries: int
        :param retries: (optional) ``max_retries`` for the session's HTTP
                        adapter. Defaults to 2.

        :type ignore_errors: bool
        :param ignore_errors: (optional) When exactly ``True``, return
                              ``False`` on a failed download instead of
                              re-raising the exception.

        :rtype: bool
        :returns: ``True`` if the file was downloaded; ``False`` if the
                  download failed and ``ignore_errors`` is ``True``; ``None``
                  if the download was skipped.

        :raises IOError: if ``destdir`` is an existing regular file.
        """
        verbose = False if verbose is None else verbose
        silent = False if silent is None else silent
        ignore_existing = False if ignore_existing is None else ignore_existing
        checksum = False if checksum is None else checksum
        retries = 2 if not retries else retries
        ignore_errors = False if not ignore_errors else ignore_errors

        self.item.session._mount_http_adapter(max_retries=retries)
        file_path = self.name if not file_path else file_path

        if destdir:
            if not os.path.exists(destdir):
                os.mkdir(destdir)
            if os.path.isfile(destdir):
                raise IOError('{} is not a directory!'.format(destdir))
            file_path = os.path.join(destdir, file_path)

        if os.path.exists(file_path):
            if ignore_existing:
                msg = 'skipping {0}, file already exists.'.format(file_path)
                log.info(msg)
                if verbose:
                    print(' ' + msg)
                elif silent is False:
                    print('.', end='')
                    sys.stdout.flush()
                return
            elif checksum:
                # Hash inside a context manager so the handle is closed
                # deterministically rather than leaked until garbage collection.
                with open(file_path, 'rb') as fp:
                    md5_sum = utils.get_md5(fp)
                if md5_sum == self.md5:
                    msg = ('skipping {0}, '
                           'file already exists based on checksum.'.format(file_path))
                    log.info(msg)
                    if verbose:
                        print(' ' + msg)
                    elif silent is False:
                        print('.', end='')
                        sys.stdout.flush()
                    return
            else:
                st = os.stat(file_path)
                # A non-empty *_files.xml is always treated as up to date.
                if (st.st_mtime == self.mtime) and (st.st_size == self.size) \
                        or self.name.endswith('_files.xml') and st.st_size != 0:
                    msg = ('skipping {0}, file already exists '
                           'based on length and date.'.format(file_path))
                    log.info(msg)
                    if verbose:
                        print(' ' + msg)
                    elif silent is False:
                        print('.', end='')
                        sys.stdout.flush()
                    return

        parent_dir = os.path.dirname(file_path)
        if parent_dir != '' and not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        try:
            response = self.item.session.get(self.url, stream=True, timeout=12)
            response.raise_for_status()

            chunk_size = 2048
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        f.flush()
        except (RetryError, HTTPError, ConnectTimeout,
                ConnectionError, socket.error, ReadTimeout) as exc:
            msg = ('error downloading file {0}, '
                   'exception raised: {1}'.format(file_path, exc))
            log.error(msg)
            # Do not leave a partially written file behind.
            if os.path.exists(file_path):
                os.remove(file_path)
            if verbose:
                print(' ' + msg)
            elif silent is False:
                print('e', end='')
                sys.stdout.flush()
            if ignore_errors is True:
                return False
            # Re-raise the active exception with its original traceback.
            raise

        # Set mtime with mtime from files.xml.
        os.utime(file_path, (0, self.mtime))

        msg = 'downloaded {0}/{1} to {2}'.format(self.identifier,
                                                 self.name,
                                                 file_path)
        log.info(msg)
        if verbose:
            print(' ' + msg)
        elif silent is False:
            print('d', end='')
            sys.stdout.flush()
        return True
Example #8
0
    def download(self, file_path=None, verbose=None, ignore_existing=None,
                 checksum=None, destdir=None, retries=None, ignore_errors=None,
                 fileobj=None, return_responses=None, no_change_timestamp=None,
                 params=None, chunk_size=None):
        """Fetch this file from the archive and store it locally.

        :type file_path: str
        :param file_path: (optional) Local path to write to. Defaults to the
                          file's remote name (``self.name``).

        :type verbose: bool
        :param verbose: (optional) Print status messages to stderr and show a
                        progress bar while downloading.

        :type ignore_existing: bool
        :param ignore_existing: (optional) Skip the download when the
                                destination path already exists.

        :type checksum: bool
        :param checksum: (optional) Skip the download when the existing local
                         file's MD5 equals ``self.md5``. If neither this nor
                         ``ignore_existing`` is set, an existing file is
                         skipped based on size and modification time.

        :type destdir: str
        :param destdir: (optional) Directory that ``file_path`` is joined
                        onto. Created if missing (unless ``return_responses``
                        is ``True``).

        :type retries: int
        :param retries: (optional) ``max_retries`` handed to the session's
                        HTTP adapter. Defaults to 2.

        :type ignore_errors: bool
        :param ignore_errors: (optional) If truthy, return ``False`` on a
                              failed download instead of re-raising.

        :type fileobj: file-like object
        :param fileobj: (optional) Write the data to this object (e.g.
                        sys.stdout) instead of opening ``file_path``. It is
                        used as a context manager, so it is closed afterwards.

        :type return_responses: bool
        :param return_responses: (optional) Return the response object rather
                                 than writing anything to disk.

        :type no_change_timestamp: bool
        :param no_change_timestamp: (optional) If True, do not set the local
                                    file's mtime to ``self.mtime``.

        :type params: dict
        :param params: (optional) URL parameters to send with the download
                       request (e.g. `cnt=0`).

        :type chunk_size: int
        :param chunk_size: (optional) Number of bytes requested per iteration
                           of the response body. Defaults to 1048576.

        :rtype: bool
        :returns: ``True`` once the file has been downloaded; ``False`` when
                  the download failed and ``ignore_errors`` is truthy;
                  ``None`` when the download was skipped; the response object
                  when ``return_responses`` is set.

        :raises IOError: if ``destdir`` is an existing regular file.
        """
        # Normalise the optional arguments.
        if verbose is None:
            verbose = False
        if ignore_existing is None:
            ignore_existing = False
        if checksum is None:
            checksum = False
        retries = retries or 2
        ignore_errors = ignore_errors or False
        return_responses = return_responses or False
        no_change_timestamp = no_change_timestamp or False
        params = params or None

        self.item.session.mount_http_adapter(max_retries=retries)
        file_path = file_path or self.name

        if destdir:
            if not os.path.exists(destdir) and return_responses is not True:
                os.mkdir(destdir)
            if os.path.isfile(destdir):
                raise IOError(f'{destdir} is not a directory!')
            file_path = os.path.join(destdir, file_path)

        # Decide whether an already-present local copy lets us skip the fetch.
        if not return_responses and os.path.exists(file_path.encode('utf-8')):
            skip_msg = None
            if ignore_existing:
                skip_msg = f'skipping {file_path}, file already exists.'
            elif checksum:
                with open(file_path, 'rb') as fp:
                    local_md5 = utils.get_md5(fp)
                if local_md5 == self.md5:
                    skip_msg = f'skipping {file_path}, file already exists based on checksum.'
            else:
                st = os.stat(file_path.encode('utf-8'))
                # A non-empty *_files.xml is always treated as up to date.
                if (st.st_mtime == self.mtime and st.st_size == self.size) or \
                        (self.name.endswith('_files.xml') and st.st_size != 0):
                    skip_msg = f'skipping {file_path}, file already exists based on length and date.'

            if skip_msg is not None:
                log.info(skip_msg)
                if verbose:
                    print(f' {skip_msg}', file=sys.stderr)
                return

        parent_dir = os.path.dirname(file_path)
        if parent_dir != '' and not os.path.exists(parent_dir):
            if return_responses is not True:
                os.makedirs(parent_dir)

        try:
            response = self.item.session.get(
                self.url, stream=True, timeout=12, auth=self.auth, params=params)
            response.raise_for_status()
            if return_responses:
                return response

            # The progress bar is only real in verbose mode; otherwise a
            # no-op context manager (yielding None) stands in for it.
            if not verbose:
                progress_bar = nullcontext()
            else:
                total = int(response.headers.get('content-length', 0)) or None
                progress_bar = tqdm(desc=f' downloading {self.name}',
                                    total=total,
                                    unit='iB',
                                    unit_scale=True,
                                    unit_divisor=1024)

            chunk_size = chunk_size or 1048576
            sink = fileobj or open(file_path.encode('utf-8'), 'wb')

            with sink, progress_bar as bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if not chunk:
                        continue
                    written = sink.write(chunk)
                    if bar is not None:
                        bar.update(written)
        except (RetryError, HTTPError, ConnectTimeout,
                ConnectionError, socket.error, ReadTimeout) as exc:
            msg = f'error downloading file {file_path}, exception raised: {exc}'
            log.error(msg)
            # Do not leave a partially written file behind.
            if os.path.exists(file_path):
                os.remove(file_path)
            if verbose:
                print(f' {msg}', file=sys.stderr)
            if ignore_errors:
                return False
            raise exc

        # Set mtime with mtime from files.xml.
        if not no_change_timestamp:
            try:
                os.utime(file_path.encode('utf-8'), (0, self.mtime))
            except OSError:
                # Probably file-like object, e.g. sys.stdout.
                pass

        log.info(f'downloaded {self.identifier}/{self.name} to {file_path}')
        return True
Example #9
0
    def download(self,
                 file_path=None,
                 verbose=None,
                 silent=None,
                 ignore_existing=None,
                 checksum=None,
                 destdir=None,
                 retries=None,
                 ignore_errors=None):
        """Download the file into the current working directory.

        :type file_path: str
        :param file_path: Download file to the given file_path.

        :type ignore_existing: bool
        :param ignore_existing: Overwrite local files if they already
                                exist.

        :type checksum: bool
        :param checksum: Skip downloading file based on checksum.

        """
        verbose = False if verbose is None else verbose
        silent = False if silent is None else silent
        ignore_existing = False if ignore_existing is None else ignore_existing
        checksum = False if checksum is None else checksum
        retries = 2 if not retries else retries
        ignore_errors = False if not ignore_errors else ignore_errors

        self.item.session._mount_http_adapter(max_retries=retries)
        file_path = self.name if not file_path else file_path

        if destdir:
            if not os.path.exists(destdir):
                os.mkdir(destdir)
            if os.path.isfile(destdir):
                raise IOError('{} is not a directory!'.format(destdir))
            file_path = os.path.join(destdir, file_path)

        if os.path.exists(file_path):
            if ignore_existing:
                msg = 'skipping {0}, file already exists.'.format(file_path)
                log.info(msg)
                if verbose:
                    print(' ' + msg)
                elif silent is False:
                    print('.', end='')
                    sys.stdout.flush()
                return
            elif checksum:
                md5_sum = utils.get_md5(open(file_path, 'rb'))
                if md5_sum == self.md5:
                    msg = ('skipping {0}, '
                           'file already exists based on checksum.'.format(
                               file_path))
                    log.info(msg)
                    if verbose:
                        print(' ' + msg)
                    elif silent is False:
                        print('.', end='')
                        sys.stdout.flush()
                    return
            else:
                st = os.stat(file_path)
                if (st.st_mtime == self.mtime) and (st.st_size == self.size) \
                        or self.name.endswith('_files.xml') and st.st_size != 0:
                    msg = ('skipping {0}, file already exists '
                           'based on length and date.'.format(file_path))
                    log.info(msg)
                    if verbose:
                        print(' ' + msg)
                    elif silent is False:
                        print('.', end='')
                        sys.stdout.flush()
                    return

        parent_dir = os.path.dirname(file_path)
        if parent_dir != '' and not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        try:
            response = self.item.session.get(self.url, stream=True, timeout=12)
            response.raise_for_status()

            chunk_size = 2048
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        f.flush()
        except (RetryError, HTTPError, ConnectTimeout, ConnectionError,
                socket.error, ReadTimeout) as exc:
            msg = ('error downloading file {0}, '
                   'exception raised: {1}'.format(file_path, exc))
            log.error(msg)
            if os.path.exists(file_path):
                os.remove(file_path)
            if verbose:
                print(' ' + msg)
            elif silent is False:
                print('e', end='')
                sys.stdout.flush()
            if ignore_errors is True:
                return False
            else:
                raise exc

        # Set mtime with mtime from files.xml.
        os.utime(file_path, (0, self.mtime))

        msg = 'downloaded {0}/{1} to {2}'.format(self.identifier, self.name,
                                                 file_path)
        log.info(msg)
        if verbose:
            print(' ' + msg)
        elif silent is False:
            print('d', end='')
            sys.stdout.flush()
        return True
Example #10
0
    def upload_file(self,
                    body,
                    key=None,
                    metadata=None,
                    file_metadata=None,
                    headers=None,
                    access_key=None,
                    secret_key=None,
                    queue_derive=None,
                    verbose=None,
                    verify=None,
                    checksum=None,
                    delete=None,
                    retries=None,
                    retries_sleep=None,
                    debug=None,
                    validate_identifier=None,
                    request_kwargs=None):
        """Upload a single file to an item. The item will be created
        if it does not exist.

        ``body`` is always closed before this method returns or raises,
        including when the caller passed in an already-open file object.

        :type body: Filepath or file-like object.
        :param body: File or data to be uploaded.

        :type key: str
        :param key: (optional) Remote filename. Defaults to the last path
                    component of the local filename.

        :type metadata: dict
        :param metadata: (optional) Metadata used to create a new item.

        :type file_metadata: dict
        :param file_metadata: (optional) File-level metadata to add to
                              the files.xml entry for the file being
                              uploaded.

        :type headers: dict
        :param headers: (optional) Add additional IA-S3 headers to request.
                        The dict is copied, never modified.

        :type access_key: str
        :param access_key: (optional) IA-S3 access key. Defaults to the
                           session's access key.

        :type secret_key: str
        :param secret_key: (optional) IA-S3 secret key. Defaults to the
                           session's secret key.

        :type queue_derive: bool
        :param queue_derive: (optional) Set to False to prevent an item from
                             being derived after upload. Note that ``None``
                             (the default) is coerced to False here.

        :type verify: bool
        :param verify: (optional) Verify local MD5 checksum matches the MD5
                       checksum of the file received by IAS3.

        :type checksum: bool
        :param checksum: (optional) Skip based on checksum. Always enabled
                         when ``delete`` is True.

        :type delete: bool
        :param delete: (optional) Delete local file after the upload has been
                       successfully verified.

        :type retries: int
        :param retries: (optional) Number of times to retry the given request
                        if S3 returns a 503 SlowDown error.

        :type retries_sleep: int
        :param retries_sleep: (optional) Amount of time to sleep between
                              ``retries``. Defaults to 30.

        :type verbose: bool
        :param verbose: (optional) Print progress to stderr.

        :type debug: bool
        :param debug: (optional) Set to True to build and return the
                      prepared request without sending the upload request.

        :type validate_identifier: bool
        :param validate_identifier: (optional) Set to True to validate the identifier before
                                    uploading the file.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Keyword arguments passed on to
                               ``session.send``. A ``timeout`` of 120 is
                               used unless one is given. The dict is
                               copied, never modified.

        :returns: The response of the upload request; an empty ``Response``
                  if the upload was skipped based on checksum; or the
                  prepared request if ``debug`` is True.

        :raises HTTPError: If the upload request fails. The exception is
                           re-raised with a descriptive message and chained
                           to the original error.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> response = item.upload_file('/path/to/image.jpg',
            ...                             key='photos/image1.jpg')
        """
        # Set defaults.
        headers = headers or {}
        metadata = metadata or {}
        file_metadata = file_metadata or {}
        access_key = access_key or self.session.access_key
        secret_key = secret_key or self.session.secret_key
        queue_derive = bool(queue_derive)
        verbose = bool(verbose)
        verify = bool(verify)
        delete = bool(delete)
        # Set checksum after delete: deleting the local copy is only safe
        # once the remote copy has been checked against it.
        checksum = delete or checksum
        retries = retries or 0
        retries_sleep = retries_sleep or 30
        debug = bool(debug)
        validate_identifier = bool(validate_identifier)
        # Work on a copy so the caller's dict is never mutated.
        request_kwargs = dict(request_kwargs or {})
        request_kwargs.setdefault('timeout', 120)
        md5_sum = None

        _headers = headers.copy()

        if not hasattr(body, 'read'):
            filename = body
            body = open(body, 'rb')
        else:
            filename = key or body.name

        # Everything from here on runs under a single try/finally so that
        # ``body`` is closed on every exit path, including errors raised
        # before the request is sent.
        try:
            size = get_file_size(body)

            # Support for uploading empty files.
            if size == 0:
                _headers['Content-Length'] = '0'

            if not _headers.get('x-archive-size-hint'):
                _headers['x-archive-size-hint'] = str(size)

            # Build IA-S3 URL.
            if validate_identifier:
                validate_s3_identifier(self.identifier)
            key = norm_filepath(filename).split('/')[-1] if key is None else key
            base_url = (f'{self.session.protocol}//s3.us.archive.org/'
                        f'{self.identifier}')
            remote_path = quote(norm_filepath(key).lstrip('/').encode('utf-8'))
            url = f'{base_url}/{remote_path}'

            # Skip based on checksum.
            if checksum:
                md5_sum = get_md5(body)
                ia_file = self.get_file(key)
                if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
                    log.info(f'{key} already exists: {url}')
                    if verbose:
                        print(f' {key} already exists, skipping.',
                              file=sys.stderr)
                    if delete:
                        log.info(
                            f'{key} successfully uploaded to '
                            f'https://archive.org/download/{self.identifier}/{key} '
                            'and verified, deleting local copy')
                        # Close the handle before removing the file.
                        body.close()
                        os.remove(filename)
                    # Return an empty response object if checksums match.
                    # TODO: Is there a better way to handle this?
                    return Response()

            # require the Content-MD5 header when delete is True.
            if verify or delete:
                if not md5_sum:
                    md5_sum = get_md5(body)
                _headers['Content-MD5'] = md5_sum

            def _build_request():
                """Rewind ``body`` and build a fresh PUT request for it."""
                body.seek(0, os.SEEK_SET)
                data = body
                if verbose:
                    if size == 0:
                        # A progress bar is pointless for an empty file;
                        # still print something so there is some output.
                        print(f' uploading {key}', file=sys.stderr)
                    else:
                        try:
                            chunk_size = 1048576
                            expected_size = math.ceil(size / chunk_size)
                            chunks = chunk_generator(body, chunk_size)
                            progress_generator = tqdm(chunks,
                                                      desc=f' uploading {key}',
                                                      dynamic_ncols=True,
                                                      total=expected_size,
                                                      unit='MiB')
                            data = IterableToFileAdapter(progress_generator,
                                                         size)
                        except Exception:
                            # Best effort: fall back to a plain upload if
                            # the progress bar cannot be set up.
                            print(f' uploading {key}', file=sys.stderr)
                            data = body

                _headers.update(self.session.headers)
                return S3Request(method='PUT',
                                 url=url,
                                 headers=_headers,
                                 data=data,
                                 metadata=metadata,
                                 file_metadata=file_metadata,
                                 access_key=access_key,
                                 secret_key=secret_key,
                                 queue_derive=queue_derive)

            if debug:
                return self.session.prepare_request(_build_request())

            try:
                while True:
                    error_msg = ('s3 is overloaded, sleeping for '
                                 f'{retries_sleep} seconds and retrying. '
                                 f'{retries} retries left.')
                    if retries > 0 and self.session.s3_is_overloaded(
                            access_key=access_key):
                        # Warn before sleeping so the user knows why the
                        # upload is paused.
                        log.info(error_msg)
                        if verbose:
                            print(f' warning: {error_msg}', file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    request = _build_request()
                    prepared_request = request.prepare()

                    # chunked transfer-encoding is NOT supported by IA-S3.
                    # It should NEVER be set. Requests adds it in certain
                    # scenarios (e.g. if content-length is 0). Stop it.
                    if prepared_request.headers.get(
                            'transfer-encoding') == 'chunked':
                        del prepared_request.headers['transfer-encoding']

                    response = self.session.send(prepared_request,
                                                 stream=True,
                                                 **request_kwargs)
                    if (response.status_code == 503) and (retries > 0):
                        log.info(error_msg)
                        if verbose:
                            print(f' warning: {error_msg}', file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    if response.status_code == 503:
                        log.info('maximum retries exceeded, upload failed.')
                    break
                response.raise_for_status()
                log.info(f'uploaded {key} to {url}')
                if delete and response.status_code == 200:
                    log.info(
                        f'{key} successfully uploaded to '
                        f'https://archive.org/download/{self.identifier}/{key} and verified, '
                        'deleting local copy')
                    # Close the handle before removing the file.
                    body.close()
                    os.remove(filename)
                response.close()
                return response
            except HTTPError as exc:
                try:
                    msg = get_s3_xml_text(exc.response.content)
                except ExpatError:  # probably HTTP 500 error and response is invalid XML
                    msg = (
                        'IA S3 returned invalid XML '
                        f'(HTTP status code {exc.response.status_code}). '
                        'This is a server side error which is either temporary, '
                        'or requires the intervention of IA admins.')

                error_msg = f' error uploading {key} to {self.identifier}, {msg}'
                log.error(error_msg)
                if verbose:
                    print(f' error uploading {key}: {msg}', file=sys.stderr)
                # Raise HTTPError with error message, keeping the original
                # exception as the cause.
                raise type(exc)(error_msg,
                                response=exc.response,
                                request=exc.request) from exc
        finally:
            body.close()