def _upload_files(item, files, upload_kwargs, prev_identifier=None, archive_session=None): """Helper function for calling :meth:`Item.upload`""" responses = [] if (upload_kwargs['verbose']) and (prev_identifier != item.identifier): print('{0}:'.format(item.identifier)) try: response = item.upload(files, **upload_kwargs) responses += response except HTTPError as exc: responses += [exc.response] finally: # Debug mode. if upload_kwargs['debug']: for i, r in enumerate(responses): if i != 0: print('---') headers = '\n'.join( [' {0}:{1}'.format(k, v) for (k, v) in r.headers.items()] ) print('Endpoint:\n {0}\n'.format(r.url)) print('HTTP Headers:\n{0}'.format(headers)) return responses # Format error message for any non 200 responses that # we haven't caught yet,and write to stderr. if responses and responses[-1] and responses[-1].status_code != 200: if not responses[-1].status_code: return responses filename = responses[-1].request.url.split('/')[-1] msg = get_s3_xml_text(responses[-1].content) print(' error uploading {0}: {2}'.format(filename, msg), file=sys.stderr) return responses
def upload_file(self, body, key=None, metadata=None, headers=None,
                access_key=None, secret_key=None, queue_derive=None,
                verbose=None, verify=None, checksum=None, delete=None,
                retries=None, retries_sleep=None, debug=None,
                request_kwargs=None):
    """Upload a single file to an item. The item will be created
    if it does not exist.

    :type body: Filepath or file-like object.
    :param body: File or data to be uploaded.

    :type key: str
    :param key: (optional) Remote filename.

    :type metadata: dict
    :param metadata: (optional) Metadata used to create a new item.

    :type headers: dict
    :param headers: (optional) Add additional IA-S3 headers to request.

    :type queue_derive: bool
    :param queue_derive: (optional) Set to False to prevent an item from
                         being derived after upload.

    :type verify: bool
    :param verify: (optional) Verify local MD5 checksum matches the MD5
                   checksum of the file received by IAS3.

    :type checksum: bool
    :param checksum: (optional) Skip based on checksum.

    :type delete: bool
    :param delete: (optional) Delete local file after the upload has been
                   successfully verified.

    :type retries: int
    :param retries: (optional) Number of times to retry the given request
                    if S3 returns a 503 SlowDown error.

    :type retries_sleep: int
    :param retries_sleep: (optional) Amount of time to sleep between
                          ``retries``.

    :type verbose: bool
    :param verbose: (optional) Print progress to stdout.

    :type debug: bool
    :param debug: (optional) Set to True to print headers to stdout, and
                  exit without sending the upload request.

    Usage::

        >>> import internetarchive
        >>> item = internetarchive.Item('identifier')
        >>> item.upload_file('/path/to/image.jpg',
        ...                  key='photos/image1.jpg')
        True
    """
    # Set defaults.
    headers = {} if headers is None else headers
    metadata = {} if metadata is None else metadata
    access_key = self.session.access_key if access_key is None else access_key
    secret_key = self.session.secret_key if secret_key is None else secret_key
    queue_derive = True if queue_derive is None else queue_derive
    verbose = False if verbose is None else verbose
    verify = True if verify is None else verify
    delete = False if delete is None else delete
    # Set checksum after delete: delete forces checksum-verification on,
    # since a file should only be removed locally once confirmed remote.
    checksum = True if delete or checksum is None else checksum
    retries = 0 if retries is None else retries
    retries_sleep = 30 if retries_sleep is None else retries_sleep
    debug = False if debug is None else debug
    request_kwargs = {} if request_kwargs is None else request_kwargs
    md5_sum = None

    # Accept either a filepath or an already-open file-like object.
    # NOTE(review): when a filepath is given the handle opened here is
    # never explicitly closed — confirm whether callers rely on GC.
    if not hasattr(body, 'read'):
        body = open(body, 'rb')

    size = get_file_size(body)
    if not headers.get('x-archive-size-hint'):
        headers['x-archive-size-hint'] = size

    # Build IA-S3 URL.
    key = body.name.split('/')[-1] if key is None else key
    base_url = '{0.session.protocol}//s3.us.archive.org/{0.identifier}'.format(self)
    url = '{0}/{1}'.format(
        base_url, urllib.parse.quote(key.lstrip('/').encode('utf-8')))

    # Skip based on checksum: if the remote file already has the same MD5
    # and no tasks are pending on the item, don't re-upload.
    if checksum:
        md5_sum = get_md5(body)
        ia_file = self.get_file(key)
        if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
            log.info('{f} already exists: {u}'.format(f=key, u=url))
            if verbose:
                print(' {f} already exists, skipping.'.format(f=key))
            if delete:
                log.info(
                    '{f} successfully uploaded to '
                    'https://archive.org/download/{i}/{f} '
                    'and verified, deleting '
                    'local copy'.format(i=self.identifier, f=key))
                os.remove(body.name)
            # Return an empty response object if checksums match.
            # TODO: Is there a better way to handle this?
            return Response()

    # require the Content-MD5 header when delete is True.
    if verify or delete:
        if not md5_sum:
            md5_sum = get_md5(body)
        headers['Content-MD5'] = md5_sum

    def _build_request():
        # Rebuild the request from the start of the file on every attempt
        # (the retry loop below may call this more than once).
        body.seek(0, os.SEEK_SET)
        if verbose:
            try:
                chunk_size = 1048576
                expected_size = size / chunk_size + 1
                chunks = chunk_generator(body, chunk_size)
                progress_generator = progress.bar(
                    chunks,
                    expected_size=expected_size,
                    label=' uploading {f}: '.format(f=key))
                data = IterableToFileAdapter(progress_generator, size)
            except:
                # Progress bar setup failed; fall back to a plain upload.
                print(' uploading {f}'.format(f=key))
                data = body
        else:
            data = body

        request = S3Request(method='PUT',
                            url=url,
                            headers=headers,
                            data=data,
                            metadata=metadata,
                            access_key=access_key,
                            secret_key=secret_key,
                            queue_derive=queue_derive)
        return request

    if debug:
        # Debug mode: return the unsent request for inspection.
        return _build_request()
    else:
        try:
            error_msg = ('s3 is overloaded, sleeping for '
                         '{0} seconds and retrying. '
                         '{1} retries left.'.format(retries_sleep, retries))
            while True:
                # Pre-flight check: back off before sending if S3 reports
                # overload and retries remain.
                if retries > 0:
                    if self.session.s3_is_overloaded(access_key):
                        sleep(retries_sleep)
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg),
                                  file=sys.stderr)
                        retries -= 1
                        continue
                request = _build_request()
                prepared_request = request.prepare()
                response = self.session.send(prepared_request,
                                             stream=True,
                                             **request_kwargs)
                # 503 SlowDown: sleep and retry until retries run out.
                if (response.status_code == 503) and (retries > 0):
                    log.info(error_msg)
                    if verbose:
                        print(' warning: {0}'.format(error_msg),
                              file=sys.stderr)
                    sleep(retries_sleep)
                    retries -= 1
                    continue
                else:
                    if response.status_code == 503:
                        log.info('maximum retries exceeded, upload failed.')
                    break
            response.raise_for_status()
            log.info('uploaded {f} to {u}'.format(f=key, u=url))
            if delete and response.status_code == 200:
                log.info(
                    '{f} successfully uploaded to '
                    'https://archive.org/download/{i}/{f} and verified, deleting '
                    'local copy'.format(i=self.identifier, f=key))
                os.remove(body.name)
            return response
        except HTTPError as exc:
            msg = get_s3_xml_text(exc.response.content)
            error_msg = (' error uploading {0} to {1}, '
                         '{2}'.format(key, self.identifier, msg))
            log.error(error_msg)
            if verbose:
                print(' error uploading {0}: {1}'.format(key, msg),
                      file=sys.stderr)
            # Raise HTTPError with error message.
            raise type(exc)(error_msg, response=exc.response, request=exc.request)
def main(argv, session):
    """Entry point for the ``ia delete`` command: parse and validate
    CLI args, then delete the selected files from an item via IA-S3.

    :param argv: Command-line arguments passed to docopt.
    :param session: An ArchiveSession used to fetch the item and send
        delete requests.
    Exits with status 1 on validation failure, when no files match, or
    when any delete fails.
    """
    args = docopt(__doc__, argv=argv)

    # Validation error messages.
    invalid_id_msg = ('<identifier> should be between 3 and 80 characters in length, and '
                      'can only contain alphanumeric characters, underscores ( _ ), or '
                      'dashes ( - )')

    # Validate args.
    s = Schema({
        six.text_type: Use(lambda x: bool(x)),
        '<file>': And(list, Use(
            lambda x: convert_str_list_to_unicode(x) if six.PY2 else x)),
        '--format': list,
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
        '--glob': list,
        'delete': bool,
        '--retries': Use(lambda i: int(i[0])),
        '<identifier>': str,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)),
              file=sys.stderr)
        sys.exit(1)

    verbose = True if not args['--quiet'] else False
    item = session.get_item(args['<identifier>'])
    if not item.exists:
        # BUG FIX: the identifier was never formatted into the message
        # (a literal '{0}' was printed), and "does't" was a typo.
        print('{0}: skipping, item doesn\'t exist.'.format(args['<identifier>']))

    # Files that cannot be deleted via S3.
    no_delete = ['_meta.xml', '_files.xml', '_meta.sqlite']

    # Add keep-old-version by default.
    if 'x-archive-keep-old-version' not in args['--header']:
        args['--header']['x-archive-keep-old-version'] = '1'

    if verbose:
        sys.stdout.write('Deleting files from {0}\n'.format(item.identifier))

    if args['--all']:
        files = [f for f in item.get_files()]
        # BUG FIX: the key was misspelled '--cacade', so --all never
        # actually enabled cascade deletion.
        args['--cascade'] = True
    elif args['--glob']:
        files = item.get_files(glob_pattern=args['--glob'])
    elif args['--format']:
        files = item.get_files(formats=args['--format'])
    else:
        fnames = []
        if args['<file>'] == ['-']:
            # Read filenames from stdin when '<file>' is '-'.
            if six.PY2:
                fnames = convert_str_list_to_unicode(
                    [f.strip() for f in sys.stdin])
            else:
                fnames = [f.strip() for f in sys.stdin]
        else:
            fnames = [f.strip() for f in args['<file>']]
        files = list(item.get_files(fnames))

    if not files:
        sys.stderr.write(' warning: no files found, nothing deleted.\n')
        sys.exit(1)

    errors = False
    for f in files:
        if not f:
            if verbose:
                sys.stderr.write(' error: "{0}" does not exist\n'.format(f.name))
            errors = True
        # Never attempt to delete protected metadata files.
        if any(f.name.endswith(s) for s in no_delete):
            continue
        if args['--dry-run']:
            sys.stdout.write(' will delete: {0}/{1}\n'.format(
                item.identifier, f.name.encode('utf-8')))
            continue
        try:
            # BUG FIX: the option key is '--header', not '--headers';
            # the old key raised KeyError on every delete.
            resp = f.delete(verbose=verbose,
                            cascade_delete=args['--cascade'],
                            headers=args['--header'],
                            retries=args['--retries'])
        except requests.exceptions.RetryError:
            print(' error: max retries exceeded for {0}'.format(f.name),
                  file=sys.stderr)
            errors = True
            continue
        if resp.status_code != 204:
            errors = True
            msg = get_s3_xml_text(resp.content)
            print(' error: {0} ({1})'.format(msg, resp.status_code),
                  file=sys.stderr)
            continue

    if errors is True:
        sys.exit(1)
def main(argv, session):
    """Entry point for the ``ia delete`` command: parse and validate
    CLI args, then delete the selected files from an item via IA-S3.

    :param argv: Command-line arguments passed to docopt.
    :param session: An ArchiveSession used to fetch the item and send
        delete requests.
    Exits with status 1 on validation failure, when no files match, or
    when any delete fails.
    """
    args = docopt(__doc__, argv=argv)

    # Validation error messages.
    invalid_id_msg = ('<identifier> should be between 3 and 80 characters in length, and '
                      'can only contain alphanumeric characters, underscores ( _ ), or '
                      'dashes ( - )')

    # Validate args.
    s = Schema({
        six.text_type: Use(lambda x: bool(x)),
        '<file>': And(list, Use(
            lambda x: convert_str_list_to_unicode(x) if six.PY2 else x)),
        '--format': list,
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
        '--glob': list,
        'delete': bool,
        '--retries': Use(lambda i: int(i[0])),
        '<identifier>': str,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)),
              file=sys.stderr)
        sys.exit(1)

    verbose = True if not args['--quiet'] else False
    item = session.get_item(args['<identifier>'])
    if not item.exists:
        # BUG FIX: the identifier was never formatted into the message
        # (a literal '{0}' was printed), and "does't" was a typo.
        print('{0}: skipping, item doesn\'t exist.'.format(args['<identifier>']))

    # Files that cannot be deleted via S3.
    no_delete = ['_meta.xml', '_files.xml', '_meta.sqlite']

    # Add keep-old-version by default.
    if 'x-archive-keep-old-version' not in args['--header']:
        args['--header']['x-archive-keep-old-version'] = '1'

    if verbose:
        sys.stdout.write('Deleting files from {0}\n'.format(item.identifier))

    if args['--all']:
        files = [f for f in item.get_files()]
        # BUG FIX: the key was misspelled '--cacade', so --all never
        # actually enabled cascade deletion.
        args['--cascade'] = True
    elif args['--glob']:
        files = item.get_files(glob_pattern=args['--glob'])
    elif args['--format']:
        files = item.get_files(formats=args['--format'])
    else:
        fnames = []
        if args['<file>'] == ['-']:
            # Read filenames from stdin when '<file>' is '-'.
            if six.PY2:
                fnames = convert_str_list_to_unicode(
                    [f.strip() for f in sys.stdin])
            else:
                fnames = [f.strip() for f in sys.stdin]
        else:
            fnames = [f.strip() for f in args['<file>']]
        files = list(item.get_files(fnames))

    if not files:
        sys.stderr.write(' warning: no files found, nothing deleted.\n')
        sys.exit(1)

    errors = False
    for f in files:
        if not f:
            if verbose:
                sys.stderr.write(' error: "{0}" does not exist\n'.format(f.name))
            errors = True
        # Never attempt to delete protected metadata files.
        if any(f.name.endswith(s) for s in no_delete):
            continue
        if args['--dry-run']:
            sys.stdout.write(' will delete: {0}/{1}\n'.format(
                item.identifier, f.name.encode('utf-8')))
            continue
        try:
            resp = f.delete(verbose=verbose,
                            cascade_delete=args['--cascade'],
                            headers=args['--header'],
                            retries=args['--retries'])
        except requests.exceptions.RetryError:
            print(' error: max retries exceeded for {0}'.format(f.name),
                  file=sys.stderr)
            errors = True
            continue
        if resp.status_code != 204:
            errors = True
            msg = get_s3_xml_text(resp.content)
            print(' error: {0} ({1})'.format(msg, resp.status_code),
                  file=sys.stderr)
            continue

    if errors is True:
        sys.exit(1)
def upload_file(self, body, key=None, metadata=None, headers=None,
                access_key=None, secret_key=None, queue_derive=None,
                verbose=None, verify=None, checksum=None, delete=None,
                retries=None, retries_sleep=None, debug=None,
                request_kwargs=None):
    """Upload a single file to an item. The item will be created
    if it does not exist.

    :type body: Filepath or file-like object.
    :param body: File or data to be uploaded.

    :type key: str
    :param key: (optional) Remote filename.

    :type metadata: dict
    :param metadata: (optional) Metadata used to create a new item.

    :type headers: dict
    :param headers: (optional) Add additional IA-S3 headers to request.

    :type queue_derive: bool
    :param queue_derive: (optional) Set to False to prevent an item from
                         being derived after upload.

    :type verify: bool
    :param verify: (optional) Verify local MD5 checksum matches the MD5
                   checksum of the file received by IAS3.

    :type checksum: bool
    :param checksum: (optional) Skip based on checksum.

    :type delete: bool
    :param delete: (optional) Delete local file after the upload has been
                   successfully verified.

    :type retries: int
    :param retries: (optional) Number of times to retry the given request
                    if S3 returns a 503 SlowDown error.

    :type retries_sleep: int
    :param retries_sleep: (optional) Amount of time to sleep between
                          ``retries``.

    :type verbose: bool
    :param verbose: (optional) Print progress to stdout.

    :type debug: bool
    :param debug: (optional) Set to True to print headers to stdout, and
                  exit without sending the upload request.

    Usage::

        >>> import internetarchive
        >>> item = internetarchive.Item('identifier')
        >>> item.upload_file('/path/to/image.jpg',
        ...                  key='photos/image1.jpg')
        True
    """
    # Set defaults.
    headers = {} if headers is None else headers
    metadata = {} if metadata is None else metadata
    access_key = self.session.access_key if access_key is None else access_key
    secret_key = self.session.secret_key if secret_key is None else secret_key
    queue_derive = True if queue_derive is None else queue_derive
    verbose = False if verbose is None else verbose
    verify = True if verify is None else verify
    delete = False if delete is None else delete
    # Set checksum after delete: delete implies checksum verification.
    checksum = True if delete else checksum
    retries = 0 if retries is None else retries
    retries_sleep = 30 if retries_sleep is None else retries_sleep
    debug = False if debug is None else debug
    request_kwargs = {} if request_kwargs is None else request_kwargs
    if 'timeout' not in request_kwargs:
        request_kwargs['timeout'] = 120
    md5_sum = None

    # Accept either a filepath or an already-open file-like object;
    # remember the local filename so it can be removed after a verified
    # upload when delete=True.
    if not hasattr(body, 'read'):
        filename = body
        body = open(body, 'rb')
    else:
        if key:
            filename = key
        else:
            filename = body.name

    size = get_file_size(body)

    # Support for uploading empty files.
    if size == 0:
        headers['Content-Length'] = '0'

    if not headers.get('x-archive-size-hint'):
        headers['x-archive-size-hint'] = str(size)

    # Build IA-S3 URL.
    key = norm_filepath(filename).split('/')[-1] if key is None else key
    base_url = '{0.session.protocol}//s3.us.archive.org/{0.identifier}'.format(self)
    url = '{0}/{1}'.format(
        base_url,
        urllib.parse.quote(norm_filepath(key).lstrip('/').encode('utf-8')))

    # Skip based on checksum: if the remote copy already matches and no
    # tasks are pending on the item, don't re-upload.
    if checksum:
        md5_sum = get_md5(body)
        ia_file = self.get_file(key)
        if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
            log.info('{f} already exists: {u}'.format(f=key, u=url))
            if verbose:
                print(' {f} already exists, skipping.'.format(f=key))
            if delete:
                log.info(
                    '{f} successfully uploaded to '
                    'https://archive.org/download/{i}/{f} '
                    'and verified, deleting '
                    'local copy'.format(i=self.identifier, f=key))
                body.close()
                os.remove(filename)
            # Return an empty response object if checksums match.
            # TODO: Is there a better way to handle this?
            body.close()
            return Response()

    # require the Content-MD5 header when delete is True.
    if verify or delete:
        if not md5_sum:
            md5_sum = get_md5(body)
        headers['Content-MD5'] = md5_sum

    def _build_request():
        # Rebuild the request from the start of the file on every attempt
        # (the retry loop below may call this more than once).
        body.seek(0, os.SEEK_SET)
        if verbose:
            try:
                # hack to raise exception so we get some output for
                # empty files.
                if size == 0:
                    raise Exception
                chunk_size = 1048576
                expected_size = size / chunk_size + 1
                chunks = chunk_generator(body, chunk_size)
                progress_generator = progress.bar(
                    chunks,
                    expected_size=expected_size,
                    label=' uploading {f}: '.format(f=key))
                data = IterableToFileAdapter(progress_generator, size)
            except:
                # Progress bar setup failed; fall back to a plain upload.
                print(' uploading {f}'.format(f=key))
                data = body
        else:
            data = body

        headers.update(self.session.headers)
        request = S3Request(method='PUT',
                            url=url,
                            headers=headers,
                            data=data,
                            metadata=metadata,
                            access_key=access_key,
                            secret_key=secret_key,
                            queue_derive=queue_derive)
        return request

    if debug:
        # Debug mode: return the prepared (unsent) request for inspection.
        prepared_request = self.session.prepare_request(_build_request())
        body.close()
        return prepared_request
    else:
        try:
            error_msg = ('s3 is overloaded, sleeping for '
                         '{0} seconds and retrying. '
                         '{1} retries left.'.format(retries_sleep, retries))
            while True:
                # Pre-flight check: back off before sending if S3 reports
                # overload and retries remain.
                if retries > 0:
                    if self.session.s3_is_overloaded(access_key):
                        sleep(retries_sleep)
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg),
                                  file=sys.stderr)
                        retries -= 1
                        continue
                request = _build_request()
                prepared_request = request.prepare()

                # chunked transfer-encoding is NOT supported by IA-S3.
                # It should NEVER be set. Requests adds it in certain
                # scenarios (e.g. if content-length is 0). Stop it.
                if prepared_request.headers.get('transfer-encoding') == 'chunked':
                    del prepared_request.headers['transfer-encoding']

                response = self.session.send(prepared_request,
                                             stream=True,
                                             **request_kwargs)
                # 503 SlowDown: sleep and retry until retries run out.
                if (response.status_code == 503) and (retries > 0):
                    log.info(error_msg)
                    if verbose:
                        print(' warning: {0}'.format(error_msg),
                              file=sys.stderr)
                    sleep(retries_sleep)
                    retries -= 1
                    continue
                else:
                    if response.status_code == 503:
                        log.info('maximum retries exceeded, upload failed.')
                    break
            response.raise_for_status()
            log.info(u'uploaded {f} to {u}'.format(f=key, u=url))
            if delete and response.status_code == 200:
                log.info(
                    '{f} successfully uploaded to '
                    'https://archive.org/download/{i}/{f} and verified, deleting '
                    'local copy'.format(i=self.identifier, f=key))
                body.close()
                os.remove(filename)
            body.close()
            return response
        except HTTPError as exc:
            body.close()
            msg = get_s3_xml_text(exc.response.content)
            error_msg = (' error uploading {0} to {1}, '
                         '{2}'.format(key, self.identifier, msg))
            log.error(error_msg)
            if verbose:
                print(' error uploading {0}: {1}'.format(key, msg),
                      file=sys.stderr)
            # Raise HTTPError with error message.
            raise type(exc)(error_msg, response=exc.response, request=exc.request)
def main(argv, session, cmd='copy'):
    """Entry point for the ``ia copy`` command (also reused by other
    commands via ``cmd``): copy a file from one item to another using an
    IA-S3 PUT with ``x-amz-copy-source``.

    :param argv: Command-line arguments passed to docopt.
    :param session: An ArchiveSession used to build and send the request.
    :param cmd: Name of the invoking command, used in messages and usage
        text (e.g. 'copy' or 'move').
    :returns: ``(response, SRC_FILE)`` when invoked by another command;
        exits with status 1 on failure.
    """
    args = docopt(__doc__, argv=argv)
    src_path = args['<src-identifier>/<src-file>']
    dest_path = args['<dest-identifier>/<dest-file>']

    # If src == dest, the file gets deleted!
    try:
        assert src_path != dest_path
    except AssertionError:
        print('error: The source and destination files cannot be the same!',
              file=sys.stderr)
        sys.exit(1)

    global SRC_ITEM
    SRC_ITEM = session.get_item(src_path.split('/')[0])

    # Validate args.
    s = Schema({
        str: Use(bool),
        '<src-identifier>/<src-file>': And(
            str,
            And(And(
                str,
                lambda x: '/' in x,
                # BUG FIX: user-facing message had a typo ('Destiantion').
                error='Destination not formatted correctly. See usage example.'
            ),
                assert_src_file_exists,
                error=('https://archive.org/download/{} does not exist. '
                       'Please check the identifier and filepath and retry.'.
                       format(src_path)))),
        '<dest-identifier>/<dest-file>': And(
            str, lambda x: '/' in x,
            error='Destination not formatted correctly. See usage example.'),
        '--metadata': Or(None, And(Use(get_args_dict), dict),
                         error='--metadata must be formatted as --metadata="key:value"'),
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        # This module is sometimes called by other modules.
        # Replace references to 'ia copy' in __doc__ with 'ia {cmd}' for clarity.
        usage = printable_usage(__doc__.replace('ia copy', 'ia {}'.format(cmd)))
        print('{0}\n{1}'.format(str(exc), usage), file=sys.stderr)
        sys.exit(1)

    args['--header']['x-amz-copy-source'] = '/{}'.format(src_path)
    args['--header']['x-amz-metadata-directive'] = 'COPY'
    # (Removed a no-op bare expression statement `args['--header']`.)

    # Add keep-old-version by default.
    if 'x-archive-keep-old-version' not in args['--header']:
        args['--header']['x-archive-keep-old-version'] = '1'

    url = '{}//s3.us.archive.org/{}'.format(session.protocol, dest_path)
    req = ia.iarequest.S3Request(url=url,
                                 method='PUT',
                                 metadata=args['--metadata'],
                                 headers=args['--header'],
                                 access_key=session.access_key,
                                 secret_key=session.secret_key)
    p = req.prepare()
    r = session.send(p)
    if r.status_code != 200:
        try:
            msg = get_s3_xml_text(r.text)
        except Exception:
            # Response body was not valid S3 XML; show it verbatim.
            msg = r.text
        print('error: failed to {} "{}" to "{}" - {}'.format(
            cmd, src_path, dest_path, msg))
        sys.exit(1)
    elif cmd == 'copy':
        print('success: copied "{}" to "{}".'.format(src_path, dest_path))
    else:
        return (r, SRC_FILE)
def main(argv, session, cmd='copy'):
    """Entry point for the ``ia copy`` command (also reused by other
    commands via ``cmd``): copy a file from one item to another using an
    IA-S3 PUT with ``x-amz-copy-source``.

    :param argv: Command-line arguments passed to docopt.
    :param session: An ArchiveSession used to build and send the request.
    :param cmd: Name of the invoking command, used in messages and usage
        text (e.g. 'copy' or 'move').
    :returns: ``(response, SRC_FILE)`` when invoked by another command;
        exits with status 1 on failure.
    """
    args = docopt(__doc__, argv=argv)
    src_path = args['<src-identifier>/<src-file>']
    dest_path = args['<dest-identifier>/<dest-file>']

    # If src == dest, the file gets deleted!
    try:
        assert src_path != dest_path
    except AssertionError:
        print('error: The source and destination files cannot be the same!',
              file=sys.stderr)
        sys.exit(1)

    global SRC_ITEM
    SRC_ITEM = session.get_item(src_path.split('/')[0])

    # Validate args.
    s = Schema({
        str: Use(bool),
        '<src-identifier>/<src-file>': And(
            str,
            And(And(
                str,
                lambda x: '/' in x,
                error='Destination not formatted correctly. See usage example.'),
                assert_src_file_exists,
                error=('https://archive.org/download/{} does not exist. '
                       'Please check the identifier and filepath and retry.'.
                       format(src_path)))),
        '<dest-identifier>/<dest-file>': And(
            str, lambda x: '/' in x,
            error='Destination not formatted correctly. See usage example.'),
        '--metadata': Or(None, And(Use(get_args_dict), dict),
                         error='--metadata must be formatted as --metadata="key:value"'),
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        # This module is sometimes called by other modules.
        # Replace references to 'ia copy' in __doc__ with 'ia {cmd}' for clarity.
        usage = printable_usage(__doc__.replace('ia copy', 'ia {}'.format(cmd)))
        print('{0}\n{1}'.format(str(exc), usage), file=sys.stderr)
        sys.exit(1)

    args['--header']['x-amz-copy-source'] = '/{}'.format(src_path)
    args['--header']['x-amz-metadata-directive'] = 'COPY'
    # (Removed a no-op bare expression statement `args['--header']`.)

    # Add keep-old-version by default.
    if 'x-archive-keep-old-version' not in args['--header']:
        args['--header']['x-archive-keep-old-version'] = '1'

    url = '{}//s3.us.archive.org/{}'.format(session.protocol, dest_path)
    req = ia.iarequest.S3Request(url=url,
                                 method='PUT',
                                 metadata=args['--metadata'],
                                 headers=args['--header'],
                                 access_key=session.access_key,
                                 secret_key=session.secret_key)
    p = req.prepare()
    r = session.send(p)
    if r.status_code != 200:
        try:
            msg = get_s3_xml_text(r.text)
        except Exception:
            # Response body was not valid S3 XML; show it verbatim.
            msg = r.text
        print('error: failed to {} "{}" to "{}" - {}'.format(
            cmd, src_path, dest_path, msg))
        sys.exit(1)
    elif cmd == 'copy':
        print('success: copied "{}" to "{}".'.format(src_path, dest_path))
    else:
        return (r, SRC_FILE)
def upload_file(self, body, key=None, metadata=None, file_metadata=None,
                headers=None, access_key=None, secret_key=None,
                queue_derive=None, verbose=None, verify=None, checksum=None,
                delete=None, retries=None, retries_sleep=None, debug=None,
                validate_identifier=None, request_kwargs=None):
    """Upload a single file to an item. The item will be created
    if it does not exist.

    :type body: Filepath or file-like object.
    :param body: File or data to be uploaded.

    :type key: str
    :param key: (optional) Remote filename.

    :type metadata: dict
    :param metadata: (optional) Metadata used to create a new item.

    :type file_metadata: dict
    :param file_metadata: (optional) File-level metadata to add to
                          the files.xml entry for the file being
                          uploaded.

    :type headers: dict
    :param headers: (optional) Add additional IA-S3 headers to request.

    :type queue_derive: bool
    :param queue_derive: (optional) Set to False to prevent an item from
                         being derived after upload.

    :type verify: bool
    :param verify: (optional) Verify local MD5 checksum matches the MD5
                   checksum of the file received by IAS3.

    :type checksum: bool
    :param checksum: (optional) Skip based on checksum.

    :type delete: bool
    :param delete: (optional) Delete local file after the upload has been
                   successfully verified.

    :type retries: int
    :param retries: (optional) Number of times to retry the given request
                    if S3 returns a 503 SlowDown error.

    :type retries_sleep: int
    :param retries_sleep: (optional) Amount of time to sleep between
                          ``retries``.

    :type verbose: bool
    :param verbose: (optional) Print progress to stdout.

    :type debug: bool
    :param debug: (optional) Set to True to print headers to stdout, and
                  exit without sending the upload request.

    :type validate_identifier: bool
    :param validate_identifier: (optional) Set to True to validate the
                                identifier before uploading the file.

    Usage::

        >>> import internetarchive
        >>> item = internetarchive.Item('identifier')
        >>> item.upload_file('/path/to/image.jpg',
        ...                  key='photos/image1.jpg')
        True
    """
    # Set defaults.
    headers = headers or {}
    metadata = metadata or {}
    file_metadata = file_metadata or {}
    access_key = access_key or self.session.access_key
    secret_key = secret_key or self.session.secret_key
    queue_derive = bool(queue_derive)
    verbose = bool(verbose)
    verify = bool(verify)
    delete = bool(delete)
    # Set checksum after delete: delete implies checksum verification.
    checksum = delete or checksum
    retries = retries or 0
    retries_sleep = retries_sleep or 30
    debug = bool(debug)
    validate_identifier = bool(validate_identifier)
    request_kwargs = request_kwargs or {}
    if 'timeout' not in request_kwargs:
        request_kwargs['timeout'] = 120
    md5_sum = None

    # Work on a copy so the caller's headers dict is never mutated.
    _headers = headers.copy()

    # Accept either a filepath or an already-open file-like object;
    # remember the local filename so it can be removed after a verified
    # upload when delete=True.
    if not hasattr(body, 'read'):
        filename = body
        body = open(body, 'rb')
    else:
        filename = key or body.name

    size = get_file_size(body)

    # Support for uploading empty files.
    if size == 0:
        _headers['Content-Length'] = '0'

    if not _headers.get('x-archive-size-hint'):
        _headers['x-archive-size-hint'] = str(size)

    # Build IA-S3 URL.
    if validate_identifier:
        validate_s3_identifier(self.identifier)
    key = norm_filepath(filename).split('/')[-1] if key is None else key
    base_url = f'{self.session.protocol}//s3.us.archive.org/{self.identifier}'
    url = f'{base_url}/{quote(norm_filepath(key).lstrip("/").encode("utf-8"))}'

    # Skip based on checksum: if the remote copy already matches and no
    # tasks are pending on the item, don't re-upload.
    if checksum:
        md5_sum = get_md5(body)
        ia_file = self.get_file(key)
        if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
            log.info(f'{key} already exists: {url}')
            if verbose:
                print(f' {key} already exists, skipping.', file=sys.stderr)
            if delete:
                log.info(
                    f'{key} successfully uploaded to '
                    f'https://archive.org/download/{self.identifier}/{key} '
                    'and verified, deleting local copy')
                body.close()
                os.remove(filename)
            # Return an empty response object if checksums match.
            # TODO: Is there a better way to handle this?
            body.close()
            return Response()

    # require the Content-MD5 header when delete is True.
    if verify or delete:
        if not md5_sum:
            md5_sum = get_md5(body)
        _headers['Content-MD5'] = md5_sum

    def _build_request():
        # Rebuild the request from the start of the file on every attempt
        # (the retry loop below may call this more than once).
        body.seek(0, os.SEEK_SET)
        if verbose:
            try:
                # hack to raise exception so we get some output for
                # empty files.
                if size == 0:
                    raise Exception
                chunk_size = 1048576
                expected_size = math.ceil(size / chunk_size)
                chunks = chunk_generator(body, chunk_size)
                progress_generator = tqdm(chunks,
                                          desc=f' uploading {key}',
                                          dynamic_ncols=True,
                                          total=expected_size,
                                          unit='MiB')
                data = IterableToFileAdapter(progress_generator, size)
            except:
                # Progress bar setup failed; fall back to a plain upload.
                print(f' uploading {key}', file=sys.stderr)
                data = body
        else:
            data = body

        _headers.update(self.session.headers)
        request = S3Request(method='PUT',
                            url=url,
                            headers=_headers,
                            data=data,
                            metadata=metadata,
                            file_metadata=file_metadata,
                            access_key=access_key,
                            secret_key=secret_key,
                            queue_derive=queue_derive)
        return request

    if debug:
        # Debug mode: return the prepared (unsent) request for inspection.
        prepared_request = self.session.prepare_request(_build_request())
        body.close()
        return prepared_request
    else:
        try:
            while True:
                error_msg = ('s3 is overloaded, sleeping for '
                             f'{retries_sleep} seconds and retrying. '
                             f'{retries} retries left.')
                # Pre-flight check: back off before sending if S3 reports
                # overload and retries remain.
                if retries > 0:
                    if self.session.s3_is_overloaded(access_key=access_key):
                        sleep(retries_sleep)
                        log.info(error_msg)
                        if verbose:
                            print(f' warning: {error_msg}', file=sys.stderr)
                        retries -= 1
                        continue
                request = _build_request()
                prepared_request = request.prepare()

                # chunked transfer-encoding is NOT supported by IA-S3.
                # It should NEVER be set. Requests adds it in certain
                # scenarios (e.g. if content-length is 0). Stop it.
                if prepared_request.headers.get(
                        'transfer-encoding') == 'chunked':
                    del prepared_request.headers['transfer-encoding']

                response = self.session.send(prepared_request,
                                             stream=True,
                                             **request_kwargs)
                # 503 SlowDown: sleep and retry until retries run out.
                if (response.status_code == 503) and (retries > 0):
                    log.info(error_msg)
                    if verbose:
                        print(f' warning: {error_msg}', file=sys.stderr)
                    sleep(retries_sleep)
                    retries -= 1
                    continue
                else:
                    if response.status_code == 503:
                        log.info('maximum retries exceeded, upload failed.')
                    break
            response.raise_for_status()
            log.info(f'uploaded {key} to {url}')
            if delete and response.status_code == 200:
                log.info(
                    f'{key} successfully uploaded to '
                    f'https://archive.org/download/{self.identifier}/{key} and verified, '
                    'deleting local copy')
                body.close()
                os.remove(filename)
            response.close()
            return response
        except HTTPError as exc:
            try:
                msg = get_s3_xml_text(exc.response.content)
            except ExpatError:
                # probably HTTP 500 error and response is invalid XML
                msg = (
                    'IA S3 returned invalid XML '
                    f'(HTTP status code {exc.response.status_code}). '
                    'This is a server side error which is either temporary, '
                    'or requires the intervention of IA admins.')
            error_msg = f' error uploading {key} to {self.identifier}, {msg}'
            log.error(error_msg)
            if verbose:
                print(f' error uploading {key}: {msg}', file=sys.stderr)
            # Raise HTTPError with error message.
            raise type(exc)(error_msg, response=exc.response, request=exc.request)
        finally:
            # Always release the file handle, success or failure.
            body.close()
def main(argv, session):
    """Entry point for the legacy ``ia delete`` command: parse and
    validate CLI args, then delete the selected files from an item.

    :param argv: Command-line arguments passed to docopt.
    :param session: An ArchiveSession used to fetch the item and send
        delete requests.
    Exits with status 1 on validation failure, when no files match, or
    when any delete fails.
    """
    args = docopt(__doc__, argv=argv)

    # Validation error messages.
    invalid_id_msg = (
        '<identifier> should be between 3 and 80 characters in length, and '
        'can only contain alphanumeric characters, underscores ( _ ), or '
        'dashes ( - )')

    # Validate args.
    s = Schema({
        text_type: Use(lambda x: bool(x)),
        '<file>': list,
        '--format': list,
        '--glob': list,
        'delete': bool,
        '<identifier>': Or(None, And(str, validate_ia_identifier,
                                     error=invalid_id_msg)),
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)),
              file=sys.stderr)
        sys.exit(1)

    verbose = True if not args['--quiet'] else False
    item = session.get_item(args['<identifier>'])
    if not item.exists:
        # BUG FIX: the identifier was never formatted into the message
        # (a literal '{0}' was printed), and "does't" was a typo.
        print('{0}: skipping, item doesn\'t exist.'.format(args['<identifier>']))

    # Files that cannot be deleted via S3.
    no_delete = ['_meta.xml', '_files.xml', '_meta.sqlite']

    if verbose:
        sys.stdout.write('Deleting files from {0}\n'.format(item.identifier))

    if args['--all']:
        files = [f for f in item.get_files()]
        # BUG FIX: the key was misspelled '--cacade', so --all never
        # actually enabled cascade deletion.
        args['--cascade'] = True
    elif args['--glob']:
        files = item.get_files(glob_pattern=args['--glob'])
    elif args['--format']:
        files = item.get_files(formats=args['--format'])
    else:
        fnames = []
        # NOTE(review): .decode('utf-8') assumes bytes input — correct on
        # Python 2 where stdin/argv yield bytes; would fail on Python 3.
        if args['<file>'] == ['-']:
            fnames = [f.strip().decode('utf-8') for f in sys.stdin]
        else:
            fnames = [f.strip().decode('utf-8') for f in args['<file>']]
        files = [f for f in [item.get_file(f) for f in fnames] if f]

    if not files:
        sys.stderr.write(' warning: no files found, nothing deleted.\n')
        sys.exit(1)

    for f in files:
        if not f:
            if verbose:
                sys.stderr.write(' error: "{0}" does not exist\n'.format(
                    f.name))
            sys.exit(1)
        # Never attempt to delete protected metadata files.
        if any(f.name.endswith(s) for s in no_delete):
            continue
        if args['--dry-run']:
            sys.stdout.write(' will delete: {0}/{1}\n'.format(
                item.identifier, f.name.encode('utf-8')))
            continue
        resp = f.delete(verbose=verbose, cascade_delete=args['--cascade'])
        if resp.status_code != 204:
            msg = get_s3_xml_text(resp.content)
            sys.stderr.write(' error: {0} ({1})\n'.format(
                msg, resp.status_code))
            sys.exit(1)
def main(argv: list[str] | None, session: ia.session.ArchiveSession,
         cmd: str = 'copy') -> tuple[Response, ia.files.File]:
    """Copy a file from one archive.org item to another via IA-S3.

    :param argv: CLI arguments, parsed with docopt against ``__doc__``.
    :param session: ArchiveSession used to fetch the source item and send
                    the S3 PUT copy request.
    :param cmd: Calling command name ('copy' or 'move'); used to adjust
                usage/error text when invoked by other modules.
    :returns: Tuple of the PUT :class:`Response` and the source file object.
    """
    args = docopt(__doc__, argv=argv)
    src_path = args['<src-identifier>/<src-file>']
    dest_path = args['<dest-identifier>/<dest-file>']

    # If src == dest, the copy would delete the file! Use an explicit check
    # rather than `assert`, which is stripped under `python -O`.
    if src_path == dest_path:
        print('error: The source and destination files cannot be the same!',
              file=sys.stderr)
        sys.exit(1)

    global SRC_ITEM
    SRC_ITEM = session.get_item(src_path.split('/')[0])  # type: ignore

    # Validate args.
    s = Schema({
        str: Use(bool),
        '<src-identifier>/<src-file>': And(
            str,
            And(
                And(
                    str,
                    lambda x: '/' in x,
                    # Fixed: this message previously said "Destination"
                    # even though it validates the *source* argument.
                    error='Source not formatted correctly. See usage example.'
                ),
                assert_src_file_exists,
                error=(
                    f'https://{session.host}/download/{src_path} does not exist. '
                    'Please check the identifier and filepath and retry.'))),
        '<dest-identifier>/<dest-file>': And(
            str,
            lambda x: '/' in x,
            error='Destination not formatted correctly. See usage example.'),
        '--metadata': Or(
            None, And(Use(get_args_dict), dict),
            error='--metadata must be formatted as --metadata="key:value"'),
        '--replace-metadata': Use(bool),
        '--header': Or(
            None, And(Use(get_args_dict), dict),
            error='--header must be formatted as --header="key:value"'),
        '--ignore-file-metadata': Use(bool),
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        # This module is sometimes called by other modules.
        # Replace references to 'ia copy' in __doc__ with 'ia {cmd}' for clarity.
        usage = printable_usage(__doc__.replace('ia copy', f'ia {cmd}'))
        print(f'{exc}\n{usage}', file=sys.stderr)
        sys.exit(1)

    args['--header']['x-amz-copy-source'] = f'/{quote(src_path)}'
    # Copy the old metadata verbatim if no additional metadata is supplied,
    # else combine the old and the new metadata in a sensible manner.
    if args['--metadata'] or args['--replace-metadata']:
        args['--header']['x-amz-metadata-directive'] = 'REPLACE'
    else:
        args['--header']['x-amz-metadata-directive'] = 'COPY'

    # New metadata takes precedence over old metadata.
    if not args['--replace-metadata']:
        args['--metadata'] = merge_dictionaries(
            SRC_ITEM.metadata,  # type: ignore
            args['--metadata'])

    # File metadata is copied by default but can be dropped.
    file_metadata = None if args['--ignore-file-metadata'] else SRC_FILE.metadata  # type: ignore

    # Add keep-old-version by default.
    if not args['--header'].get('x-archive-keep-old-version') \
            and not args['--no-backup']:
        args['--header']['x-archive-keep-old-version'] = '1'

    url = f'{session.protocol}//s3.us.archive.org/{quote(dest_path)}'
    # Derive the destination item unless --no-derive was given.
    queue_derive = args['--no-derive'] is False
    req = ia.iarequest.S3Request(url=url,
                                 method='PUT',
                                 metadata=args['--metadata'],
                                 file_metadata=file_metadata,
                                 headers=args['--header'],
                                 queue_derive=queue_derive,
                                 access_key=session.access_key,
                                 secret_key=session.secret_key)
    p = req.prepare()
    r = session.send(p)
    if r.status_code != 200:
        try:
            msg = get_s3_xml_text(r.text)
        except Exception:
            # Body was not parseable S3 XML; fall back to the raw text.
            msg = r.text
        print(f'error: failed to {cmd} "{src_path}" to "{dest_path}" - {msg}',
              file=sys.stderr)
        sys.exit(1)
    elif cmd == 'copy':
        print(f'success: copied "{src_path}" to "{dest_path}".',
              file=sys.stderr)
    return (r, SRC_FILE)  # type: ignore
def main(argv, session: ArchiveSession) -> None:
    """Delete files from an archive.org item (``ia delete`` entry point).

    :param argv: CLI arguments, parsed with docopt against ``__doc__``.
    :param session: ArchiveSession used to retrieve the item and issue
                    delete requests.

    Exits with status 1 on validation failure, when no files match, or if
    any individual deletion fails.
    """
    args = docopt(__doc__, argv=argv)

    # Validate args.  (The unused <identifier> length/charset error message
    # that used to live here was dead code and has been removed.)
    s = Schema({
        str: Use(bool),
        '<file>': list,
        '--format': list,
        '--header': Or(
            None, And(Use(get_args_dict), dict),
            error='--header must be formatted as --header="key:value"'),
        '--glob': list,
        'delete': bool,
        '--retries': Use(lambda i: int(i[0])),
        '<identifier>': str,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print(f'{exc}\n{printable_usage(__doc__)}', file=sys.stderr)
        sys.exit(1)

    verbose = not args['--quiet']
    item = session.get_item(args['<identifier>'])
    if not item.exists:
        # Fixed: previously printed a literal '{0}' (format was never
        # applied) and misspelled "doesn't".
        print(f"{item.identifier}: skipping, item doesn't exist.",
              file=sys.stderr)

    # Files that cannot be deleted via S3.
    no_delete = ['_meta.xml', '_files.xml', '_meta.sqlite']

    # Add keep-old-version by default.
    if not args['--header'].get('x-archive-keep-old-version') \
            and not args['--no-backup']:
        args['--header']['x-archive-keep-old-version'] = '1'

    if verbose:
        print(f'Deleting files from {item.identifier}', file=sys.stderr)

    if args['--all']:
        files = list(item.get_files())
        args['--cascade'] = True
    elif args['--glob']:
        files = item.get_files(glob_pattern=args['--glob'])
    elif args['--format']:
        files = item.get_files(formats=args['--format'])
    else:
        # Read filenames from stdin when '-' is given, else from argv.
        if args['<file>'] == ['-']:
            fnames = [f.strip() for f in sys.stdin]
        else:
            fnames = [f.strip() for f in args['<file>']]
        files = list(item.get_files(fnames))

    if not files:
        print(' warning: no files found, nothing deleted.', file=sys.stderr)
        sys.exit(1)

    errors = False
    for f in files:
        if not f:
            # Fixed: previously dereferenced f.name on a falsy entry
            # (AttributeError) and then fell through to the endswith()
            # check below; now report safely and skip the entry.
            if verbose:
                print(f' error: "{getattr(f, "name", f)}" does not exist',
                      file=sys.stderr)
            errors = True
            continue
        # Never delete archive.org-generated metadata files.
        if any(f.name.endswith(s) for s in no_delete):
            continue
        if args['--dry-run']:
            print(f' will delete: {item.identifier}/{f.name}',
                  file=sys.stderr)
            continue
        try:
            resp = f.delete(verbose=verbose,
                            cascade_delete=args['--cascade'],
                            headers=args['--header'],
                            retries=args['--retries'])
        except requests.exceptions.RetryError:
            print(f' error: max retries exceeded for {f.name}',
                  file=sys.stderr)
            errors = True
            continue
        if resp.status_code != 204:
            errors = True
            msg = get_s3_xml_text(resp.content)
            print(f' error: {msg} ({resp.status_code})', file=sys.stderr)
            continue

    if errors:
        sys.exit(1)