def upload_file(self, body, key=None, metadata=None, headers=None,
                access_key=None, secret_key=None, queue_derive=None,
                verbose=None, verify=None, checksum=None, delete=None,
                retries=None, retries_sleep=None, debug=None,
                request_kwargs=None):
    """Upload a single file to an item. The item will be created
    if it does not exist.

    :type body: Filepath or file-like object.
    :param body: File or data to be uploaded.

    :type key: str
    :param key: (optional) Remote filename.

    :type metadata: dict
    :param metadata: (optional) Metadata used to create a new item.

    :type headers: dict
    :param headers: (optional) Add additional IA-S3 headers to request.

    :type queue_derive: bool
    :param queue_derive: (optional) Set to False to prevent an item from
                         being derived after upload.

    :type verify: bool
    :param verify: (optional) Verify local MD5 checksum matches the MD5
                   checksum of the file received by IAS3.

    :type checksum: bool
    :param checksum: (optional) Skip based on checksum.

    :type delete: bool
    :param delete: (optional) Delete local file after the upload has been
                   successfully verified.

    :type retries: int
    :param retries: (optional) Number of times to retry the given request
                    if S3 returns a 503 SlowDown error.

    :type retries_sleep: int
    :param retries_sleep: (optional) Amount of time to sleep between
                          ``retries``.

    :type verbose: bool
    :param verbose: (optional) Print progress to stdout.

    :type debug: bool
    :param debug: (optional) Set to True to print headers to stdout, and
                  exit without sending the upload request.

    Usage::

        >>> import internetarchive
        >>> item = internetarchive.Item('identifier')
        >>> item.upload_file('/path/to/image.jpg',
        ...                  key='photos/image1.jpg')
        True
    """
    # Set defaults.
    headers = {} if headers is None else headers
    metadata = {} if metadata is None else metadata
    access_key = self.session.access_key if access_key is None else access_key
    secret_key = self.session.secret_key if secret_key is None else secret_key
    queue_derive = True if queue_derive is None else queue_derive
    verbose = False if verbose is None else verbose
    verify = True if verify is None else verify
    delete = False if delete is None else delete
    # Set checksum after delete.
    checksum = True if delete or checksum is None else checksum
    retries = 0 if retries is None else retries
    retries_sleep = 30 if retries_sleep is None else retries_sleep
    debug = False if debug is None else debug
    request_kwargs = {} if request_kwargs is None else request_kwargs
    md5_sum = None

    if not hasattr(body, 'read'):
        body = open(body, 'rb')

    size = get_file_size(body)
    if not headers.get('x-archive-size-hint'):
        headers['x-archive-size-hint'] = size

    # Build IA-S3 URL.
    key = body.name.split('/')[-1] if key is None else key
    base_url = '{0.session.protocol}//s3.us.archive.org/{0.identifier}'.format(self)
    url = '{0}/{1}'.format(
        base_url, urllib.parse.quote(key.lstrip('/').encode('utf-8')))

    # Skip based on checksum.
    if checksum:
        md5_sum = get_md5(body)
        ia_file = self.get_file(key)
        if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
            log.info('{f} already exists: {u}'.format(f=key, u=url))
            if verbose:
                print(' {f} already exists, skipping.'.format(f=key))
            if delete:
                log.info(
                    '{f} successfully uploaded to '
                    'https://archive.org/download/{i}/{f} '
                    'and verified, deleting '
                    'local copy'.format(i=self.identifier, f=key))
                os.remove(body.name)
            # Return an empty response object if checksums match.
            # TODO: Is there a better way to handle this?
            return Response()

    # require the Content-MD5 header when delete is True.
    if verify or delete:
        if not md5_sum:
            md5_sum = get_md5(body)
        headers['Content-MD5'] = md5_sum

    def _build_request():
        body.seek(0, os.SEEK_SET)
        if verbose:
            try:
                chunk_size = 1048576
                expected_size = size / chunk_size + 1
                chunks = chunk_generator(body, chunk_size)
                progress_generator = progress.bar(
                    chunks, expected_size=expected_size,
                    label=' uploading {f}: '.format(f=key))
                data = IterableToFileAdapter(progress_generator, size)
            except:
                print(' uploading {f}'.format(f=key))
                data = body
        else:
            data = body

        request = S3Request(method='PUT',
                            url=url,
                            headers=headers,
                            data=data,
                            metadata=metadata,
                            access_key=access_key,
                            secret_key=secret_key,
                            queue_derive=queue_derive)
        return request

    if debug:
        return _build_request()
    else:
        try:
            error_msg = ('s3 is overloaded, sleeping for '
                         '{0} seconds and retrying. '
                         '{1} retries left.'.format(retries_sleep, retries))
            while True:
                if retries > 0:
                    if self.session.s3_is_overloaded(access_key):
                        sleep(retries_sleep)
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg),
                                  file=sys.stderr)
                        retries -= 1
                        continue
                request = _build_request()
                prepared_request = request.prepare()
                response = self.session.send(prepared_request,
                                             stream=True,
                                             **request_kwargs)
                if (response.status_code == 503) and (retries > 0):
                    log.info(error_msg)
                    if verbose:
                        print(' warning: {0}'.format(error_msg),
                              file=sys.stderr)
                    sleep(retries_sleep)
                    retries -= 1
                    continue
                else:
                    if response.status_code == 503:
                        log.info('maximum retries exceeded, upload failed.')
                    break
            response.raise_for_status()
            log.info('uploaded {f} to {u}'.format(f=key, u=url))
            if delete and response.status_code == 200:
                log.info(
                    '{f} successfully uploaded to '
                    'https://archive.org/download/{i}/{f} and verified, deleting '
                    'local copy'.format(i=self.identifier, f=key))
                os.remove(body.name)
            return response
        except HTTPError as exc:
            msg = get_s3_xml_text(exc.response.content)
            error_msg = (' error uploading {0} to {1}, '
                         '{2}'.format(key, self.identifier, msg))
            log.error(error_msg)
            if verbose:
                print(' error uploading {0}: {1}'.format(key, msg),
                      file=sys.stderr)
            # Raise HTTPError with error message.
            raise type(exc)(error_msg, response=exc.response, request=exc.request)
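
# A minimal usage sketch of the checksum/verify flow implemented above;
# 'my-test-item' and the local path are hypothetical placeholders, not from
# the source.
#
#     from internetarchive import get_item
#
#     item = get_item('my-test-item')
#     # verify=True (the default here) sends a Content-MD5 header so IA-S3
#     # can reject corrupted uploads; checksum=True skips the upload when
#     # the remote file already has the same MD5.
#     r = item.upload_file('/tmp/photo.jpg', key='photos/photo.jpg',
#                          checksum=True, verify=True)
#     print(r.status_code)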
def upload_file(self, body, key=None, metadata=None, headers=None,
                access_key=None, secret_key=None, queue_derive=None,
                verbose=None, verify=None, checksum=None, delete=None,
                retries=None, retries_sleep=None, debug=None,
                request_kwargs=None):
    """Upload a single file to an item. The item will be created
    if it does not exist.

    :type body: Filepath or file-like object.
    :param body: File or data to be uploaded.

    :type key: str
    :param key: (optional) Remote filename.

    :type metadata: dict
    :param metadata: (optional) Metadata used to create a new item.

    :type headers: dict
    :param headers: (optional) Add additional IA-S3 headers to request.

    :type queue_derive: bool
    :param queue_derive: (optional) Set to False to prevent an item from
                         being derived after upload.

    :type verify: bool
    :param verify: (optional) Verify local MD5 checksum matches the MD5
                   checksum of the file received by IAS3.

    :type checksum: bool
    :param checksum: (optional) Skip based on checksum.

    :type delete: bool
    :param delete: (optional) Delete local file after the upload has been
                   successfully verified.

    :type retries: int
    :param retries: (optional) Number of times to retry the given request
                    if S3 returns a 503 SlowDown error.

    :type retries_sleep: int
    :param retries_sleep: (optional) Amount of time to sleep between
                          ``retries``.

    :type verbose: bool
    :param verbose: (optional) Print progress to stdout.

    :type debug: bool
    :param debug: (optional) Set to True to print headers to stdout, and
                  exit without sending the upload request.

    Usage::

        >>> import internetarchive
        >>> item = internetarchive.Item('identifier')
        >>> item.upload_file('/path/to/image.jpg',
        ...                  key='photos/image1.jpg')
        True
    """
    # Set defaults.
    headers = {} if headers is None else headers
    metadata = {} if metadata is None else metadata
    access_key = self.session.access_key if access_key is None else access_key
    secret_key = self.session.secret_key if secret_key is None else secret_key
    queue_derive = True if queue_derive is None else queue_derive
    verbose = False if verbose is None else verbose
    verify = True if verify is None else verify
    delete = False if delete is None else delete
    # Set checksum after delete.
    checksum = True if delete or checksum is None else checksum
    retries = 0 if retries is None else retries
    retries_sleep = 30 if retries_sleep is None else retries_sleep
    debug = False if debug is None else debug
    request_kwargs = {} if request_kwargs is None else request_kwargs

    if not hasattr(body, 'read'):
        body = open(body, 'rb')

    if not metadata.get('scanner'):
        scanner = 'Internet Archive Python library {0}'.format(__version__)
        metadata['scanner'] = scanner

    try:
        body.seek(0, os.SEEK_END)
        size = body.tell()
        body.seek(0, os.SEEK_SET)
    except IOError:
        size = None

    if not headers.get('x-archive-size-hint'):
        headers['x-archive-size-hint'] = size

    key = body.name.split('/')[-1] if key is None else key
    base_url = '{protocol}//s3.us.archive.org/{identifier}'.format(
        protocol=self.session.protocol, identifier=self.identifier)
    url = '{base_url}/{key}'.format(base_url=base_url, key=key.lstrip('/'))

    # Skip based on checksum.
    md5_sum = get_md5(body)
    ia_file = self.get_file(key)
    if (checksum) and (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
        log.info('{f} already exists: {u}'.format(f=key, u=url))
        if verbose:
            print(' {f} already exists, skipping.'.format(f=key))
        if delete:
            log.info(
                '{f} successfully uploaded to https://archive.org/download/{i}/{f} '
                'and verified, deleting '
                'local copy'.format(i=self.identifier, f=key))
            os.remove(body.name)
        # Return an empty response object if checksums match.
        # TODO: Is there a better way to handle this?
        return Response()

    # require the Content-MD5 header when delete is True.
    if verify or delete:
        headers['Content-MD5'] = md5_sum

    def _build_request():
        body.seek(0, os.SEEK_SET)
        if verbose:
            try:
                chunk_size = 1048576
                expected_size = size / chunk_size + 1
                chunks = chunk_generator(body, chunk_size)
                progress_generator = progress.bar(
                    chunks, expected_size=expected_size,
                    label=' uploading {f}: '.format(f=key))
                data = IterableToFileAdapter(progress_generator, size)
            except:
                print(' uploading {f}'.format(f=key))
                data = body
        else:
            data = body

        request = S3Request(method='PUT',
                            url=url,
                            headers=headers,
                            data=data,
                            metadata=metadata,
                            access_key=access_key,
                            secret_key=secret_key,
                            queue_derive=queue_derive)
        return request

    if debug:
        return _build_request()
    else:
        try:
            error_msg = ('s3 is overloaded, sleeping for '
                         '{0} seconds and retrying. '
                         '{1} retries left.'.format(retries_sleep, retries))
            while True:
                if retries > 0:
                    if self.session.s3_is_overloaded(access_key):
                        sleep(retries_sleep)
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg),
                                  file=sys.stderr)
                        retries -= 1
                        continue
                request = _build_request()
                prepared_request = request.prepare()
                response = self.session.send(prepared_request,
                                             stream=True,
                                             **request_kwargs)
                if (response.status_code == 503) and (retries > 0):
                    log.info(error_msg)
                    if verbose:
                        print(' warning: {0}'.format(error_msg),
                              file=sys.stderr)
                    sleep(retries_sleep)
                    retries -= 1
                    continue
                else:
                    if response.status_code == 503:
                        log.info('maximum retries exceeded, upload failed.')
                    break
            response.raise_for_status()
            log.info('uploaded {f} to {u}'.format(f=key, u=url))
            if delete and response.status_code == 200:
                log.info(
                    '{f} successfully uploaded to '
                    'https://archive.org/download/{i}/{f} and verified, deleting '
                    'local copy'.format(i=self.identifier, f=key))
                os.remove(body.name)
            return response
        except HTTPError as exc:
            error_msg = (' error uploading {0} to {1}, '
                         '{2}'.format(key, self.identifier, exc))
            log.error(error_msg)
            if verbose:
                print(error_msg, file=sys.stderr)
            # Raise HTTPError with error message.
            raise type(exc)(error_msg)
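
# A minimal sketch of the 'scanner' default added in this version: when the
# caller's metadata lacks a 'scanner' key, the library injects one naming
# itself and its version. Identifier, path, and title are placeholders.
#
#     item.upload_file('/tmp/report.pdf', key='report.pdf',
#                      metadata={'title': 'A Report'})
#     # The metadata sent becomes (with the installed __version__):
#     # {'title': 'A Report',
#     #  'scanner': 'Internet Archive Python library <__version__>'}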
def upload_file(self, body, key=None, metadata=None, headers=None,
                access_key=None, secret_key=None, queue_derive=None,
                verbose=None, verify=None, checksum=None, delete=None,
                retries=None, retries_sleep=None, debug=None,
                request_kwargs=None):
    """Upload a single file to an item. The item will be created
    if it does not exist.

    :type body: Filepath or file-like object.
    :param body: File or data to be uploaded.

    :type key: str
    :param key: (optional) Remote filename.

    :type metadata: dict
    :param metadata: (optional) Metadata used to create a new item.

    :type headers: dict
    :param headers: (optional) Add additional IA-S3 headers to request.

    :type queue_derive: bool
    :param queue_derive: (optional) Set to False to prevent an item from
                         being derived after upload.

    :type verify: bool
    :param verify: (optional) Verify local MD5 checksum matches the MD5
                   checksum of the file received by IAS3.

    :type checksum: bool
    :param checksum: (optional) Skip based on checksum.

    :type delete: bool
    :param delete: (optional) Delete local file after the upload has been
                   successfully verified.

    :type retries: int
    :param retries: (optional) Number of times to retry the given request
                    if S3 returns a 503 SlowDown error.

    :type retries_sleep: int
    :param retries_sleep: (optional) Amount of time to sleep between
                          ``retries``.

    :type verbose: bool
    :param verbose: (optional) Print progress to stdout.

    :type debug: bool
    :param debug: (optional) Set to True to print headers to stdout, and
                  exit without sending the upload request.

    Usage::

        >>> import internetarchive
        >>> item = internetarchive.Item('identifier')
        >>> item.upload_file('/path/to/image.jpg',
        ...                  key='photos/image1.jpg')
        True
    """
    # Set defaults.
    headers = {} if headers is None else headers
    metadata = {} if metadata is None else metadata
    access_key = self.session.access_key if access_key is None else access_key
    secret_key = self.session.secret_key if secret_key is None else secret_key
    queue_derive = True if queue_derive is None else queue_derive
    verbose = False if verbose is None else verbose
    verify = True if verify is None else verify
    delete = False if delete is None else delete
    # Set checksum after delete.
    checksum = True if delete else checksum
    retries = 0 if retries is None else retries
    retries_sleep = 30 if retries_sleep is None else retries_sleep
    debug = False if debug is None else debug
    request_kwargs = {} if request_kwargs is None else request_kwargs
    if 'timeout' not in request_kwargs:
        request_kwargs['timeout'] = 120
    md5_sum = None

    if not hasattr(body, 'read'):
        filename = body
        body = open(body, 'rb')
    else:
        if key:
            filename = key
        else:
            filename = body.name

    size = get_file_size(body)

    # Support for uploading empty files.
    if size == 0:
        headers['Content-Length'] = '0'

    if not headers.get('x-archive-size-hint'):
        headers['x-archive-size-hint'] = str(size)

    # Build IA-S3 URL.
    key = norm_filepath(filename).split('/')[-1] if key is None else key
    base_url = '{0.session.protocol}//s3.us.archive.org/{0.identifier}'.format(self)
    url = '{0}/{1}'.format(
        base_url, urllib.parse.quote(norm_filepath(key).lstrip('/').encode('utf-8')))

    # Skip based on checksum.
    if checksum:
        md5_sum = get_md5(body)
        ia_file = self.get_file(key)
        if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
            log.info('{f} already exists: {u}'.format(f=key, u=url))
            if verbose:
                print(' {f} already exists, skipping.'.format(f=key))
            if delete:
                log.info(
                    '{f} successfully uploaded to '
                    'https://archive.org/download/{i}/{f} '
                    'and verified, deleting '
                    'local copy'.format(i=self.identifier, f=key))
                body.close()
                os.remove(filename)
            # Return an empty response object if checksums match.
            # TODO: Is there a better way to handle this?
            body.close()
            return Response()

    # require the Content-MD5 header when delete is True.
    if verify or delete:
        if not md5_sum:
            md5_sum = get_md5(body)
        headers['Content-MD5'] = md5_sum

    def _build_request():
        body.seek(0, os.SEEK_SET)
        if verbose:
            try:
                # hack to raise exception so we get some output for
                # empty files.
                if size == 0:
                    raise Exception
                chunk_size = 1048576
                expected_size = size / chunk_size + 1
                chunks = chunk_generator(body, chunk_size)
                progress_generator = progress.bar(
                    chunks, expected_size=expected_size,
                    label=' uploading {f}: '.format(f=key))
                data = IterableToFileAdapter(progress_generator, size)
            except:
                print(' uploading {f}'.format(f=key))
                data = body
        else:
            data = body

        headers.update(self.session.headers)
        request = S3Request(method='PUT',
                            url=url,
                            headers=headers,
                            data=data,
                            metadata=metadata,
                            access_key=access_key,
                            secret_key=secret_key,
                            queue_derive=queue_derive)
        return request

    if debug:
        prepared_request = self.session.prepare_request(_build_request())
        body.close()
        return prepared_request
    else:
        try:
            error_msg = ('s3 is overloaded, sleeping for '
                         '{0} seconds and retrying. '
                         '{1} retries left.'.format(retries_sleep, retries))
            while True:
                if retries > 0:
                    if self.session.s3_is_overloaded(access_key):
                        sleep(retries_sleep)
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg),
                                  file=sys.stderr)
                        retries -= 1
                        continue
                request = _build_request()
                prepared_request = request.prepare()

                # chunked transfer-encoding is NOT supported by IA-S3.
                # It should NEVER be set. Requests adds it in certain
                # scenarios (e.g. if content-length is 0). Stop it.
                if prepared_request.headers.get('transfer-encoding') == 'chunked':
                    del prepared_request.headers['transfer-encoding']

                response = self.session.send(prepared_request,
                                             stream=True,
                                             **request_kwargs)
                if (response.status_code == 503) and (retries > 0):
                    log.info(error_msg)
                    if verbose:
                        print(' warning: {0}'.format(error_msg),
                              file=sys.stderr)
                    sleep(retries_sleep)
                    retries -= 1
                    continue
                else:
                    if response.status_code == 503:
                        log.info('maximum retries exceeded, upload failed.')
                    break
            response.raise_for_status()
            log.info(u'uploaded {f} to {u}'.format(f=key, u=url))
            if delete and response.status_code == 200:
                log.info(
                    '{f} successfully uploaded to '
                    'https://archive.org/download/{i}/{f} and verified, deleting '
                    'local copy'.format(i=self.identifier, f=key))
                body.close()
                os.remove(filename)
            body.close()
            return response
        except HTTPError as exc:
            body.close()
            msg = get_s3_xml_text(exc.response.content)
            error_msg = (' error uploading {0} to {1}, '
                         '{2}'.format(key, self.identifier, msg))
            log.error(error_msg)
            if verbose:
                print(' error uploading {0}: {1}'.format(key, msg),
                      file=sys.stderr)
            # Raise HTTPError with error message.
            raise type(exc)(error_msg, response=exc.response, request=exc.request)
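
# A minimal sketch of the retry and timeout knobs handled above; all values
# are illustrative assumptions, not defaults recommended by the source.
#
#     r = item.upload_file('/tmp/big.iso', key='big.iso',
#                          retries=5,         # retry on 503 SlowDown up to 5 times
#                          retries_sleep=60,  # sleep 60s between attempts
#                          request_kwargs={'timeout': 300})  # raise the 120s default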
def upload_file(
    self,
    body,
    key=None,
    metadata=None,
    headers=None,
    access_key=None,
    secret_key=None,
    queue_derive=None,
    verbose=None,
    verify=None,
    checksum=None,
    delete=None,
    retries=None,
    retries_sleep=None,
    debug=None,
    request_kwargs=None,
):
    """Upload a single file to an item. The item will be created
    if it does not exist.

    :type body: Filepath or file-like object.
    :param body: File or data to be uploaded.

    :type key: str
    :param key: (optional) Remote filename.

    :type metadata: dict
    :param metadata: (optional) Metadata used to create a new item.

    :type headers: dict
    :param headers: (optional) Add additional IA-S3 headers to request.

    :type queue_derive: bool
    :param queue_derive: (optional) Set to False to prevent an item from
                         being derived after upload.

    :type verify: bool
    :param verify: (optional) Verify local MD5 checksum matches the MD5
                   checksum of the file received by IAS3.

    :type checksum: bool
    :param checksum: (optional) Skip based on checksum.

    :type delete: bool
    :param delete: (optional) Delete local file after the upload has been
                   successfully verified.

    :type retries: int
    :param retries: (optional) Number of times to retry the given request
                    if S3 returns a 503 SlowDown error.

    :type retries_sleep: int
    :param retries_sleep: (optional) Amount of time to sleep between
                          ``retries``.

    :type verbose: bool
    :param verbose: (optional) Print progress to stdout.

    :type debug: bool
    :param debug: (optional) Set to True to print headers to stdout, and
                  exit without sending the upload request.

    Usage::

        >>> import internetarchive
        >>> item = internetarchive.Item('identifier')
        >>> item.upload_file('/path/to/image.jpg',
        ...                  key='photos/image1.jpg')
        True
    """
    # Set defaults.
    headers = {} if headers is None else headers
    metadata = {} if metadata is None else metadata
    access_key = self.session.access_key if access_key is None else access_key
    secret_key = self.session.secret_key if secret_key is None else secret_key
    queue_derive = True if queue_derive is None else queue_derive
    verbose = False if verbose is None else verbose
    verify = True if verify is None else verify
    delete = False if delete is None else delete
    # Set checksum after delete.
    checksum = True if delete or checksum is None else checksum
    retries = 0 if retries is None else retries
    retries_sleep = 30 if retries_sleep is None else retries_sleep
    debug = False if debug is None else debug
    request_kwargs = {} if request_kwargs is None else request_kwargs

    if not hasattr(body, "read"):
        with open(body, "rb") as f:
            body = BytesIO(f.read())
            filename = f.name
    else:
        filename = body.name

    if not metadata.get("scanner"):
        scanner = "Internet Archive Python library {0}".format(__version__)
        metadata["scanner"] = scanner

    try:
        body.seek(0, os.SEEK_END)
        size = body.tell()
        body.seek(0, os.SEEK_SET)
    except IOError:
        size = None

    if not headers.get("x-archive-size-hint"):
        headers["x-archive-size-hint"] = size

    key = filename.split("/")[-1] if key is None else key
    base_url = "{protocol}//s3.us.archive.org/{identifier}".format(
        protocol=self.session.protocol, identifier=self.identifier
    )
    url = "{base_url}/{key}".format(base_url=base_url, key=key.lstrip("/"))

    # Skip based on checksum.
    md5_sum = get_md5(body)
    ia_file = self.get_file(key)
    if (checksum) and (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
        log.info("{f} already exists: {u}".format(f=key, u=url))
        if verbose:
            print(" {f} already exists, skipping.".format(f=key))
        if delete:
            log.info(
                "{f} successfully uploaded to https://archive.org/download/{i}/{f} "
                "and verified, deleting "
                "local copy".format(i=self.identifier, f=key)
            )
            os.remove(filename)
        # Return an empty response object if checksums match.
        # TODO: Is there a better way to handle this?
        return Response()

    # require the Content-MD5 header when delete is True.
    if verify or delete:
        headers["Content-MD5"] = md5_sum

    def _build_request():
        body.seek(0, os.SEEK_SET)
        if verbose:
            try:
                chunk_size = 1048576
                expected_size = size / chunk_size + 1
                chunks = chunk_generator(body, chunk_size)
                progress_generator = progress.bar(
                    chunks,
                    expected_size=expected_size,
                    label=" uploading {f}: ".format(f=key),
                )
                data = IterableToFileAdapter(progress_generator, size)
            except:
                print(" uploading {f}".format(f=key))
                data = body
        else:
            data = body

        request = S3Request(
            method="PUT",
            url=url,
            headers=headers,
            data=data,
            metadata=metadata,
            access_key=access_key,
            secret_key=secret_key,
            queue_derive=queue_derive,
        )
        return request

    if debug:
        return _build_request()
    else:
        try:
            error_msg = (
                "s3 is overloaded, sleeping for "
                "{0} seconds and retrying. "
                "{1} retries left.".format(retries_sleep, retries)
            )
            while True:
                if retries > 0:
                    if self.session.s3_is_overloaded(access_key):
                        sleep(retries_sleep)
                        log.info(error_msg)
                        if verbose:
                            print(" warning: {0}".format(error_msg), file=sys.stderr)
                        retries -= 1
                        continue
                request = _build_request()
                prepared_request = request.prepare()
                response = self.session.send(prepared_request, stream=True, **request_kwargs)
                if (response.status_code == 503) and (retries > 0):
                    log.info(error_msg)
                    if verbose:
                        print(" warning: {0}".format(error_msg), file=sys.stderr)
                    sleep(retries_sleep)
                    retries -= 1
                    continue
                else:
                    if response.status_code == 503:
                        log.info("maximum retries exceeded, upload failed.")
                    break
            response.raise_for_status()
            log.info("uploaded {f} to {u}".format(f=key, u=url))
            if delete and response.status_code == 200:
                log.info(
                    "{f} successfully uploaded to "
                    "https://archive.org/download/{i}/{f} and verified, deleting "
                    "local copy".format(i=self.identifier, f=key)
                )
                os.remove(filename)
            return response
        except HTTPError as exc:
            error_msg = (
                " error uploading {0} to {1}, {2}".format(key, self.identifier, exc)
            )
            log.error(error_msg)
            if verbose:
                print(error_msg, file=sys.stderr)
            # Raise HTTPError with error message.
            raise type(exc)(error_msg)
def download(self, file_path=None, verbose=None, silent=None,
             ignore_existing=None, checksum=None, destdir=None,
             retries=None, ignore_errors=None, fileobj=None,
             return_responses=None, no_change_timestamp=None,
             params=None):
    """Download the file into the current working directory.

    :type file_path: str
    :param file_path: Download file to the given file_path.

    :type verbose: bool
    :param verbose: (optional) Turn on verbose output.

    :type silent: bool
    :param silent: (optional) Suppress all output.

    :type ignore_existing: bool
    :param ignore_existing: Overwrite local files if they already exist.

    :type checksum: bool
    :param checksum: (optional) Skip downloading file based on checksum.

    :type destdir: str
    :param destdir: (optional) The directory to download files to.

    :type retries: int
    :param retries: (optional) The number of times to retry on failed
                    requests.

    :type ignore_errors: bool
    :param ignore_errors: (optional) Don't fail if a single file fails to
                          download, continue to download other files.

    :type fileobj: file-like object
    :param fileobj: (optional) Write data to the given file-like object
                    (e.g. sys.stdout).

    :type return_responses: bool
    :param return_responses: (optional) Rather than downloading files to
                             disk, return a list of response objects.

    :type no_change_timestamp: bool
    :param no_change_timestamp: (optional) If True, leave the time stamp as
                                the current time instead of changing it to
                                that given in the original archive.

    :type params: dict
    :param params: (optional) URL parameters to send with download request
                   (e.g. `cnt=0`).

    :rtype: bool
    :returns: True if file was successfully downloaded.
    """
    verbose = False if verbose is None else verbose
    ignore_existing = False if ignore_existing is None else ignore_existing
    checksum = False if checksum is None else checksum
    retries = 2 if not retries else retries
    ignore_errors = False if not ignore_errors else ignore_errors
    return_responses = False if not return_responses else return_responses
    no_change_timestamp = False if not no_change_timestamp else no_change_timestamp
    params = None if not params else params

    if (fileobj and silent is None) or silent is not False:
        silent = True
    else:
        silent = False

    self.item.session.mount_http_adapter(max_retries=retries)
    file_path = self.name if not file_path else file_path

    if destdir:
        if not os.path.exists(destdir) and return_responses is not True:
            os.mkdir(destdir)
        if os.path.isfile(destdir):
            raise IOError('{} is not a directory!'.format(destdir))
        file_path = os.path.join(destdir, file_path)

    if not return_responses and os.path.exists(file_path.encode('utf-8')):
        if ignore_existing:
            msg = 'skipping {0}, file already exists.'.format(file_path)
            log.info(msg)
            if verbose:
                print(' ' + msg)
            elif silent is False:
                print('.', end='')
                sys.stdout.flush()
            return
        elif checksum:
            with open(file_path, 'rb') as fp:
                md5_sum = utils.get_md5(fp)
            if md5_sum == self.md5:
                msg = ('skipping {0}, '
                       'file already exists based on checksum.'.format(file_path))
                log.info(msg)
                if verbose:
                    print(' ' + msg)
                elif silent is False:
                    print('.', end='')
                    sys.stdout.flush()
                return
        else:
            st = os.stat(file_path.encode('utf-8'))
            if (st.st_mtime == self.mtime) and (st.st_size == self.size) \
                    or self.name.endswith('_files.xml') and st.st_size != 0:
                msg = ('skipping {0}, file already exists '
                       'based on length and date.'.format(file_path))
                log.info(msg)
                if verbose:
                    print(' ' + msg)
                elif silent is False:
                    print('.', end='')
                    sys.stdout.flush()
                return

    parent_dir = os.path.dirname(file_path)
    if parent_dir != '' \
            and not os.path.exists(parent_dir) \
            and return_responses is not True:
        os.makedirs(parent_dir)

    try:
        response = self.item.session.get(self.url, stream=True, timeout=12,
                                         auth=self.auth, params=params)
        response.raise_for_status()
        if return_responses:
            return response
        chunk_size = 2048
        if not fileobj:
            fileobj = open(file_path.encode('utf-8'), 'wb')
        with fileobj:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    fileobj.write(chunk)
                    fileobj.flush()
    except (RetryError, HTTPError, ConnectTimeout,
            ConnectionError, socket.error, ReadTimeout) as exc:
        msg = ('error downloading file {0}, '
               'exception raised: {1}'.format(file_path, exc))
        log.error(msg)
        if os.path.exists(file_path):
            os.remove(file_path)
        if verbose:
            print(' ' + msg)
        elif silent is False:
            print('e', end='')
            sys.stdout.flush()
        if ignore_errors is True:
            return False
        else:
            raise exc

    # Set mtime with mtime from files.xml.
    if not no_change_timestamp:
        # If we want to set the timestamp to that of the original archive...
        try:
            os.utime(file_path.encode('utf-8'), (0, self.mtime))
        except OSError:
            # Probably file-like object, e.g. sys.stdout.
            pass

    msg = 'downloaded {0}/{1} to {2}'.format(self.identifier,
                                             self.name,
                                             file_path)
    log.info(msg)
    if verbose:
        print(' ' + msg)
    elif silent is False:
        print('d', end='')
        sys.stdout.flush()
    return True
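
# A minimal usage sketch for the download() method above; the identifier
# and directory name are hypothetical placeholders.
#
#     from internetarchive import get_item
#
#     f = get_item('my-test-item').get_file('photos/photo.jpg')
#     # checksum=True skips the transfer when the local MD5 already matches;
#     # destdir writes the file under the given directory.
#     f.download(destdir='backups', checksum=True, retries=3)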
def download(self, file_path=None, verbose=None, silent=None,
             ignore_existing=None, checksum=None, destdir=None,
             retries=None, ignore_errors=None):
    """Download the file into the current working directory.

    :type file_path: str
    :param file_path: Download file to the given file_path.

    :type ignore_existing: bool
    :param ignore_existing: Overwrite local files if they already exist.

    :type checksum: bool
    :param checksum: Skip downloading file based on checksum.
    """
    verbose = False if verbose is None else verbose
    silent = False if silent is None else silent
    ignore_existing = False if ignore_existing is None else ignore_existing
    checksum = False if checksum is None else checksum
    retries = 2 if not retries else retries
    ignore_errors = False if not ignore_errors else ignore_errors

    self.item.session._mount_http_adapter(max_retries=retries)
    file_path = self.name if not file_path else file_path

    if destdir:
        if not os.path.exists(destdir):
            os.mkdir(destdir)
        if os.path.isfile(destdir):
            raise IOError('{} is not a directory!'.format(destdir))
        file_path = os.path.join(destdir, file_path)

    if os.path.exists(file_path):
        if ignore_existing:
            msg = 'skipping {0}, file already exists.'.format(file_path)
            log.info(msg)
            if verbose:
                print(' ' + msg)
            elif silent is False:
                print('.', end='')
                sys.stdout.flush()
            return
        elif checksum:
            md5_sum = utils.get_md5(open(file_path, 'rb'))
            if md5_sum == self.md5:
                msg = ('skipping {0}, '
                       'file already exists based on checksum.'.format(file_path))
                log.info(msg)
                if verbose:
                    print(' ' + msg)
                elif silent is False:
                    print('.', end='')
                    sys.stdout.flush()
                return
        else:
            st = os.stat(file_path)
            if (st.st_mtime == self.mtime) and (st.st_size == self.size) \
                    or self.name.endswith('_files.xml') and st.st_size != 0:
                msg = ('skipping {0}, file already exists '
                       'based on length and date.'.format(file_path))
                log.info(msg)
                if verbose:
                    print(' ' + msg)
                elif silent is False:
                    print('.', end='')
                    sys.stdout.flush()
                return

    parent_dir = os.path.dirname(file_path)
    if parent_dir != '' and not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    try:
        response = self.item.session.get(self.url, stream=True, timeout=12)
        response.raise_for_status()
        chunk_size = 2048
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
                    f.flush()
    except (RetryError, HTTPError, ConnectTimeout,
            ConnectionError, socket.error, ReadTimeout) as exc:
        msg = ('error downloading file {0}, '
               'exception raised: {1}'.format(file_path, exc))
        log.error(msg)
        if os.path.exists(file_path):
            os.remove(file_path)
        if verbose:
            print(' ' + msg)
        elif silent is False:
            print('e', end='')
            sys.stdout.flush()
        if ignore_errors is True:
            return False
        else:
            raise exc

    # Set mtime with mtime from files.xml.
    os.utime(file_path, (0, self.mtime))

    msg = 'downloaded {0}/{1} to {2}'.format(self.identifier,
                                             self.name,
                                             file_path)
    log.info(msg)
    if verbose:
        print(' ' + msg)
    elif silent is False:
        print('d', end='')
        sys.stdout.flush()
    return True
def download(self, file_path=None, verbose=None,
             ignore_existing=None, checksum=None, destdir=None,
             retries=None, ignore_errors=None, fileobj=None,
             return_responses=None, no_change_timestamp=None,
             params=None, chunk_size=None):
    """Download the file into the current working directory.

    :type file_path: str
    :param file_path: Download file to the given file_path.

    :type verbose: bool
    :param verbose: (optional) Turn on verbose output.

    :type ignore_existing: bool
    :param ignore_existing: Overwrite local files if they already exist.

    :type checksum: bool
    :param checksum: (optional) Skip downloading file based on checksum.

    :type destdir: str
    :param destdir: (optional) The directory to download files to.

    :type retries: int
    :param retries: (optional) The number of times to retry on failed
                    requests.

    :type ignore_errors: bool
    :param ignore_errors: (optional) Don't fail if a single file fails to
                          download, continue to download other files.

    :type fileobj: file-like object
    :param fileobj: (optional) Write data to the given file-like object
                    (e.g. sys.stdout).

    :type return_responses: bool
    :param return_responses: (optional) Rather than downloading files to
                             disk, return a list of response objects.

    :type no_change_timestamp: bool
    :param no_change_timestamp: (optional) If True, leave the time stamp as
                                the current time instead of changing it to
                                that given in the original archive.

    :type params: dict
    :param params: (optional) URL parameters to send with download request
                   (e.g. `cnt=0`).

    :type chunk_size: int
    :param chunk_size: (optional) The size, in bytes, of the chunks read
                       from the response (default: 1048576).

    :rtype: bool
    :returns: True if file was successfully downloaded.
    """
    verbose = False if verbose is None else verbose
    ignore_existing = False if ignore_existing is None else ignore_existing
    checksum = False if checksum is None else checksum
    retries = 2 if not retries else retries
    ignore_errors = False if not ignore_errors else ignore_errors
    return_responses = False if not return_responses else return_responses
    no_change_timestamp = False if not no_change_timestamp else no_change_timestamp
    params = None if not params else params

    self.item.session.mount_http_adapter(max_retries=retries)
    file_path = self.name if not file_path else file_path

    if destdir:
        if not os.path.exists(destdir) and return_responses is not True:
            os.mkdir(destdir)
        if os.path.isfile(destdir):
            raise IOError(f'{destdir} is not a directory!')
        file_path = os.path.join(destdir, file_path)

    if not return_responses and os.path.exists(file_path.encode('utf-8')):
        if ignore_existing:
            msg = f'skipping {file_path}, file already exists.'
            log.info(msg)
            if verbose:
                print(f' {msg}', file=sys.stderr)
            return
        elif checksum:
            with open(file_path, 'rb') as fp:
                md5_sum = utils.get_md5(fp)
            if md5_sum == self.md5:
                msg = f'skipping {file_path}, file already exists based on checksum.'
                log.info(msg)
                if verbose:
                    print(f' {msg}', file=sys.stderr)
                return
        else:
            st = os.stat(file_path.encode('utf-8'))
            if (st.st_mtime == self.mtime) and (st.st_size == self.size) \
                    or self.name.endswith('_files.xml') and st.st_size != 0:
                msg = f'skipping {file_path}, file already exists based on length and date.'
                log.info(msg)
                if verbose:
                    print(f' {msg}', file=sys.stderr)
                return

    parent_dir = os.path.dirname(file_path)
    if parent_dir != '' \
            and not os.path.exists(parent_dir) \
            and return_responses is not True:
        os.makedirs(parent_dir)

    try:
        response = self.item.session.get(self.url, stream=True, timeout=12,
                                         auth=self.auth, params=params)
        response.raise_for_status()
        if return_responses:
            return response

        if verbose:
            total = int(response.headers.get('content-length', 0)) or None
            progress_bar = tqdm(desc=f' downloading {self.name}',
                                total=total,
                                unit='iB',
                                unit_scale=True,
                                unit_divisor=1024)
        else:
            progress_bar = nullcontext()

        if not chunk_size:
            chunk_size = 1048576
        if not fileobj:
            fileobj = open(file_path.encode('utf-8'), 'wb')

        with fileobj, progress_bar as bar:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    size = fileobj.write(chunk)
                    if bar is not None:
                        bar.update(size)
    except (RetryError, HTTPError, ConnectTimeout,
            ConnectionError, socket.error, ReadTimeout) as exc:
        msg = f'error downloading file {file_path}, exception raised: {exc}'
        log.error(msg)
        if os.path.exists(file_path):
            os.remove(file_path)
        if verbose:
            print(f' {msg}', file=sys.stderr)
        if ignore_errors:
            return False
        else:
            raise exc

    # Set mtime with mtime from files.xml.
    if not no_change_timestamp:
        # If we want to set the timestamp to that of the original archive...
        try:
            os.utime(file_path.encode('utf-8'), (0, self.mtime))
        except OSError:
            # Probably file-like object, e.g. sys.stdout.
            pass

    msg = f'downloaded {self.identifier}/{self.name} to {file_path}'
    log.info(msg)
    return True
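
# A minimal sketch of the streaming options in the variant above, under the
# assumption that `f` is a File object; both calls mirror the docstring.
#
#     import sys
#
#     # Stream the remote file to a file-like object instead of disk
#     # (note: the object is closed when the download finishes):
#     f.download(fileobj=sys.stdout.buffer)
#     # Or skip writing entirely and handle the requests.Response yourself:
#     resp = f.download(return_responses=True)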
def upload_file(self, body,
                key=None,
                metadata=None,
                file_metadata=None,
                headers=None,
                access_key=None,
                secret_key=None,
                queue_derive=None,
                verbose=None,
                verify=None,
                checksum=None,
                delete=None,
                retries=None,
                retries_sleep=None,
                debug=None,
                validate_identifier=None,
                request_kwargs=None):
    """Upload a single file to an item. The item will be created
    if it does not exist.

    :type body: Filepath or file-like object.
    :param body: File or data to be uploaded.

    :type key: str
    :param key: (optional) Remote filename.

    :type metadata: dict
    :param metadata: (optional) Metadata used to create a new item.

    :type file_metadata: dict
    :param file_metadata: (optional) File-level metadata to add to
                          the files.xml entry for the file being
                          uploaded.

    :type headers: dict
    :param headers: (optional) Add additional IA-S3 headers to request.

    :type queue_derive: bool
    :param queue_derive: (optional) Set to False to prevent an item from
                         being derived after upload.

    :type verify: bool
    :param verify: (optional) Verify local MD5 checksum matches the MD5
                   checksum of the file received by IAS3.

    :type checksum: bool
    :param checksum: (optional) Skip based on checksum.

    :type delete: bool
    :param delete: (optional) Delete local file after the upload has been
                   successfully verified.

    :type retries: int
    :param retries: (optional) Number of times to retry the given request
                    if S3 returns a 503 SlowDown error.

    :type retries_sleep: int
    :param retries_sleep: (optional) Amount of time to sleep between
                          ``retries``.

    :type verbose: bool
    :param verbose: (optional) Print progress to stdout.

    :type debug: bool
    :param debug: (optional) Set to True to print headers to stdout, and
                  exit without sending the upload request.

    :type validate_identifier: bool
    :param validate_identifier: (optional) Set to True to validate the
                                identifier before uploading the file.

    Usage::

        >>> import internetarchive
        >>> item = internetarchive.Item('identifier')
        >>> item.upload_file('/path/to/image.jpg',
        ...                  key='photos/image1.jpg')
        True
    """
    # Set defaults.
    headers = headers or {}
    metadata = metadata or {}
    file_metadata = file_metadata or {}
    access_key = access_key or self.session.access_key
    secret_key = secret_key or self.session.secret_key
    queue_derive = bool(queue_derive)
    verbose = bool(verbose)
    verify = bool(verify)
    delete = bool(delete)
    # Set checksum after delete.
    checksum = delete or checksum
    retries = retries or 0
    retries_sleep = retries_sleep or 30
    debug = bool(debug)
    validate_identifier = bool(validate_identifier)
    request_kwargs = request_kwargs or {}
    if 'timeout' not in request_kwargs:
        request_kwargs['timeout'] = 120
    md5_sum = None

    _headers = headers.copy()

    if not hasattr(body, 'read'):
        filename = body
        body = open(body, 'rb')
    else:
        filename = key or body.name

    size = get_file_size(body)

    # Support for uploading empty files.
    if size == 0:
        _headers['Content-Length'] = '0'

    if not _headers.get('x-archive-size-hint'):
        _headers['x-archive-size-hint'] = str(size)

    # Build IA-S3 URL.
    if validate_identifier:
        validate_s3_identifier(self.identifier)
    key = norm_filepath(filename).split('/')[-1] if key is None else key
    base_url = f'{self.session.protocol}//s3.us.archive.org/{self.identifier}'
    url = f'{base_url}/{quote(norm_filepath(key).lstrip("/").encode("utf-8"))}'

    # Skip based on checksum.
    if checksum:
        md5_sum = get_md5(body)
        ia_file = self.get_file(key)
        if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
            log.info(f'{key} already exists: {url}')
            if verbose:
                print(f' {key} already exists, skipping.', file=sys.stderr)
            if delete:
                log.info(
                    f'{key} successfully uploaded to '
                    f'https://archive.org/download/{self.identifier}/{key} '
                    'and verified, deleting local copy')
                body.close()
                os.remove(filename)
            # Return an empty response object if checksums match.
            # TODO: Is there a better way to handle this?
            body.close()
            return Response()

    # require the Content-MD5 header when delete is True.
    if verify or delete:
        if not md5_sum:
            md5_sum = get_md5(body)
        _headers['Content-MD5'] = md5_sum

    def _build_request():
        body.seek(0, os.SEEK_SET)
        if verbose:
            try:
                # hack to raise exception so we get some output for
                # empty files.
                if size == 0:
                    raise Exception
                chunk_size = 1048576
                expected_size = math.ceil(size / chunk_size)
                chunks = chunk_generator(body, chunk_size)
                progress_generator = tqdm(chunks,
                                          desc=f' uploading {key}',
                                          dynamic_ncols=True,
                                          total=expected_size,
                                          unit='MiB')
                data = IterableToFileAdapter(progress_generator, size)
            except:
                print(f' uploading {key}', file=sys.stderr)
                data = body
        else:
            data = body

        _headers.update(self.session.headers)
        request = S3Request(method='PUT',
                            url=url,
                            headers=_headers,
                            data=data,
                            metadata=metadata,
                            file_metadata=file_metadata,
                            access_key=access_key,
                            secret_key=secret_key,
                            queue_derive=queue_derive)
        return request

    if debug:
        prepared_request = self.session.prepare_request(_build_request())
        body.close()
        return prepared_request
    else:
        try:
            while True:
                error_msg = ('s3 is overloaded, sleeping for '
                             f'{retries_sleep} seconds and retrying. '
                             f'{retries} retries left.')
                if retries > 0:
                    if self.session.s3_is_overloaded(access_key=access_key):
                        sleep(retries_sleep)
                        log.info(error_msg)
                        if verbose:
                            print(f' warning: {error_msg}', file=sys.stderr)
                        retries -= 1
                        continue
                request = _build_request()
                prepared_request = request.prepare()

                # chunked transfer-encoding is NOT supported by IA-S3.
                # It should NEVER be set. Requests adds it in certain
                # scenarios (e.g. if content-length is 0). Stop it.
                if prepared_request.headers.get('transfer-encoding') == 'chunked':
                    del prepared_request.headers['transfer-encoding']

                response = self.session.send(prepared_request,
                                             stream=True,
                                             **request_kwargs)
                if (response.status_code == 503) and (retries > 0):
                    log.info(error_msg)
                    if verbose:
                        print(f' warning: {error_msg}', file=sys.stderr)
                    sleep(retries_sleep)
                    retries -= 1
                    continue
                else:
                    if response.status_code == 503:
                        log.info('maximum retries exceeded, upload failed.')
                    break
            response.raise_for_status()
            log.info(f'uploaded {key} to {url}')
            if delete and response.status_code == 200:
                log.info(
                    f'{key} successfully uploaded to '
                    f'https://archive.org/download/{self.identifier}/{key} '
                    'and verified, deleting local copy')
                body.close()
                os.remove(filename)
            response.close()
            return response
        except HTTPError as exc:
            try:
                msg = get_s3_xml_text(exc.response.content)
            except ExpatError:  # probably HTTP 500 error and response is invalid XML
                msg = ('IA S3 returned invalid XML '
                       f'(HTTP status code {exc.response.status_code}). '
                       'This is a server side error which is either temporary, '
                       'or requires the intervention of IA admins.')
            error_msg = f' error uploading {key} to {self.identifier}, {msg}'
            log.error(error_msg)
            if verbose:
                print(f' error uploading {key}: {msg}', file=sys.stderr)
            # Raise HTTPError with error message.
            raise type(exc)(error_msg,
                            response=exc.response,
                            request=exc.request)
        finally:
            body.close()
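
# A minimal sketch of the additions in the newest variant above (file-level
# metadata and identifier validation); the item, path, and metadata values
# are placeholders.
#
#     r = item.upload_file('/tmp/scan.pdf', key='scan.pdf',
#                          file_metadata={'title': 'Page scans'},
#                          validate_identifier=True,  # fail fast on a bad id
#                          queue_derive=False)        # skip the derive task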