Example 1
def _upload_files(item, files, upload_kwargs, prev_identifier=None, archive_session=None):
    """Helper function for calling :meth:`Item.upload`"""
    responses = []
    if (upload_kwargs['verbose']) and (prev_identifier != item.identifier):
        print('{0}:'.format(item.identifier))

    try:
        response = item.upload(files, **upload_kwargs)
        responses += response
    except HTTPError as exc:
        responses += [exc.response]
    finally:
        # Debug mode.
        if upload_kwargs['debug']:
            for i, r in enumerate(responses):
                if i != 0:
                    print('---')
                headers = '\n'.join(
                    [' {0}:{1}'.format(k, v) for (k, v) in r.headers.items()]
                )
                print('Endpoint:\n {0}\n'.format(r.url))
                print('HTTP Headers:\n{0}'.format(headers))
            return responses

        # Format an error message for any non-200 responses that
        # we haven't caught yet, and write it to stderr.
        if responses and responses[-1] and responses[-1].status_code != 200:
            if not responses[-1].status_code:
                return responses
            filename = responses[-1].request.url.split('/')[-1]
            msg = get_s3_xml_text(responses[-1].content)
            print(' error uploading {0}: {1}'.format(filename, msg), file=sys.stderr)

    return responses
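
A minimal driver sketch for the helper above (not part of the library): the identifier and file path are placeholders, and upload_kwargs carries the verbose/debug flags the helper reads before forwarding everything else to Item.upload.

import internetarchive

upload_kwargs = {'verbose': True, 'debug': False}  # flags read by the helper
item = internetarchive.get_item('my-test-item')    # placeholder identifier
responses = _upload_files(item, ['/path/to/file.txt'], upload_kwargs)
print([r.status_code for r in responses if r is not None])
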
Example 2
    def upload_file(self, body,
                    key=None,
                    metadata=None,
                    headers=None,
                    access_key=None,
                    secret_key=None,
                    queue_derive=None,
                    verbose=None,
                    verify=None,
                    checksum=None,
                    delete=None,
                    retries=None,
                    retries_sleep=None,
                    debug=None,
                    request_kwargs=None):
        """Upload a single file to an item. The item will be created
        if it does not exist.

        :type body: Filepath or file-like object.
        :param body: File or data to be uploaded.

        :type key: str
        :param key: (optional) Remote filename.

        :type metadata: dict
        :param metadata: (optional) Metadata used to create a new item.

        :type headers: dict
        :param headers: (optional) Add additional IA-S3 headers to request.

        :type queue_derive: bool
        :param queue_derive: (optional) Set to False to prevent an item from
                             being derived after upload.

        :type verify: bool
        :param verify: (optional) Verify local MD5 checksum matches the MD5
                       checksum of the file received by IAS3.

        :type checksum: bool
        :param checksum: (optional) Skip based on checksum.

        :type delete: bool
        :param delete: (optional) Delete local file after the upload has been
                       successfully verified.

        :type retries: int
        :param retries: (optional) Number of times to retry the given request
                        if S3 returns a 503 SlowDown error.

        :type retries_sleep: int
        :param retries_sleep: (optional) Amount of time to sleep between
                              ``retries``.

        :type verbose: bool
        :param verbose: (optional) Print progress to stdout.

        :type debug: bool
        :param debug: (optional) Set to True to print headers to stdout, and
                      exit without sending the upload request.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> item.upload_file('/path/to/image.jpg',
            ...                  key='photos/image1.jpg')
            True
        """
        # Set defaults.
        headers = {} if headers is None else headers
        metadata = {} if metadata is None else metadata
        access_key = self.session.access_key if access_key is None else access_key
        secret_key = self.session.secret_key if secret_key is None else secret_key
        queue_derive = True if queue_derive is None else queue_derive
        verbose = False if verbose is None else verbose
        verify = True if verify is None else verify
        delete = False if delete is None else delete
        # Set checksum after delete.
        checksum = True if delete or checksum is None else checksum
        retries = 0 if retries is None else retries
        retries_sleep = 30 if retries_sleep is None else retries_sleep
        debug = False if debug is None else debug
        request_kwargs = {} if request_kwargs is None else request_kwargs
        md5_sum = None

        if not hasattr(body, 'read'):
            body = open(body, 'rb')

        size = get_file_size(body)

        if not headers.get('x-archive-size-hint'):
            headers['x-archive-size-hint'] = str(size)

        # Build IA-S3 URL.
        key = body.name.split('/')[-1] if key is None else key
        base_url = '{0.session.protocol}//s3.us.archive.org/{0.identifier}'.format(self)
        url = '{0}/{1}'.format(
            base_url, urllib.parse.quote(key.lstrip('/').encode('utf-8')))

        # Skip based on checksum.
        if checksum:
            md5_sum = get_md5(body)
            ia_file = self.get_file(key)
            if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
                log.info('{f} already exists: {u}'.format(f=key, u=url))
                if verbose:
                    print(' {f} already exists, skipping.'.format(f=key))
                if delete:
                    log.info(
                        '{f} successfully uploaded to '
                        'https://archive.org/download/{i}/{f} '
                        'and verified, deleting '
                        'local copy'.format(i=self.identifier,
                                            f=key))
                    os.remove(body.name)
                # Return an empty response object if checksums match.
                # TODO: Is there a better way to handle this?
                return Response()

        # Require the Content-MD5 header when verify or delete is True.
        if verify or delete:
            if not md5_sum:
                md5_sum = get_md5(body)
            headers['Content-MD5'] = md5_sum

        def _build_request():
            body.seek(0, os.SEEK_SET)
            if verbose:
                try:
                    chunk_size = 1048576
                    expected_size = size / chunk_size + 1
                    chunks = chunk_generator(body, chunk_size)
                    progress_generator = progress.bar(
                        chunks,
                        expected_size=expected_size,
                        label=' uploading {f}: '.format(f=key))
                    data = IterableToFileAdapter(progress_generator, size)
                except:
                    print(' uploading {f}'.format(f=key))
                    data = body
            else:
                data = body

            request = S3Request(method='PUT',
                                url=url,
                                headers=headers,
                                data=data,
                                metadata=metadata,
                                access_key=access_key,
                                secret_key=secret_key,
                                queue_derive=queue_derive)
            return request

        if debug:
            return _build_request()
        else:
            try:
                error_msg = ('s3 is overloaded, sleeping for '
                             '{0} seconds and retrying. '
                             '{1} retries left.'.format(retries_sleep, retries))
                while True:
                    if retries > 0:
                        if self.session.s3_is_overloaded(access_key):
                            sleep(retries_sleep)
                            log.info(error_msg)
                            if verbose:
                                print(' warning: {0}'.format(error_msg), file=sys.stderr)
                            retries -= 1
                            continue
                    request = _build_request()
                    prepared_request = request.prepare()
                    response = self.session.send(prepared_request,
                                                 stream=True,
                                                 **request_kwargs)
                    if (response.status_code == 503) and (retries > 0):
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg), file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    else:
                        if response.status_code == 503:
                            log.info('maximum retries exceeded, upload failed.')
                        break
                response.raise_for_status()
                log.info('uploaded {f} to {u}'.format(f=key, u=url))
                if delete and response.status_code == 200:
                    log.info(
                        '{f} successfully uploaded to '
                        'https://archive.org/download/{i}/{f} and verified, deleting '
                        'local copy'.format(i=self.identifier,
                                            f=key))
                    os.remove(body.name)
                return response
            except HTTPError as exc:
                msg = get_s3_xml_text(exc.response.content)
                error_msg = (' error uploading {0} to {1}, '
                             '{2}'.format(key, self.identifier, msg))
                log.error(error_msg)
                if verbose:
                    print(' error uploading {0}: {1}'.format(key, msg), file=sys.stderr)
                # Raise HTTPError with error message.
                raise type(exc)(error_msg, response=exc.response, request=exc.request)
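
The progress display in _build_request leans on two helpers, chunk_generator and IterableToFileAdapter, imported from the library's utils module. A simplified sketch of what they plausibly do, assumed for illustration and not the library's exact code:

def chunk_generator(fileobj, chunk_size):
    # Yield successive chunk_size blocks of the file until EOF.
    while True:
        chunk = fileobj.read(chunk_size)
        if not chunk:
            break
        yield chunk


class IterableToFileAdapter:
    # Wrap an iterable of byte chunks so requests can stream it like a
    # file object of known length (sketch, not the library's code).
    def __init__(self, iterable, size):
        self.iterator = iter(iterable)
        self.length = size
        self.buffer = b''

    def __len__(self):
        return self.length

    def read(self, size=-1):
        while size < 0 or len(self.buffer) < size:
            try:
                self.buffer += next(self.iterator)
            except StopIteration:
                break
        if size < 0:
            result, self.buffer = self.buffer, b''
        else:
            result, self.buffer = self.buffer[:size], self.buffer[size:]
        return result
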
Example 3
def main(argv, session):
    args = docopt(__doc__, argv=argv)

    # Validation error messages.
    invalid_id_msg = ('<identifier> should be between 3 and 80 characters in length, and '
                      'can only contain alphanumeric characters, underscores ( _ ), or '
                      'dashes ( - )')

    # Validate args.
    s = Schema({
        six.text_type: Use(lambda x: bool(x)),
        '<file>': And(list, Use(
            lambda x: convert_str_list_to_unicode(x) if six.PY2 else x)),
        '--format': list,
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
        '--glob': list,
        'delete': bool,
        '--retries': Use(lambda i: int(i[0])),
        '<identifier>': str,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)), file=sys.stderr)
        sys.exit(1)

    verbose = True if not args['--quiet'] else False
    item = session.get_item(args['<identifier>'])
    if not item.exists:
        print('{0}: skipping, item doesn\'t exist.'.format(args['<identifier>']))

    # Files that cannot be deleted via S3.
    no_delete = ['_meta.xml', '_files.xml', '_meta.sqlite']

    # Add keep-old-version by default.
    if 'x-archive-keep-old-version' not in args['--header']:
        args['--header']['x-archive-keep-old-version'] = '1'

    if verbose:
        sys.stdout.write('Deleting files from {0}\n'.format(item.identifier))

    if args['--all']:
        files = [f for f in item.get_files()]
        args['--cascade'] = True
    elif args['--glob']:
        files = item.get_files(glob_pattern=args['--glob'])
    elif args['--format']:
        files = item.get_files(formats=args['--format'])
    else:
        fnames = []
        if args['<file>'] == ['-']:
            if six.PY2:
                fnames = convert_str_list_to_unicode([f.strip() for f in sys.stdin])
            else:
                fnames = [f.strip() for f in sys.stdin]
        else:
            fnames = [f.strip() for f in args['<file>']]

        files = list(item.get_files(fnames))

    if not files:
        sys.stderr.write(' warning: no files found, nothing deleted.\n')
        sys.exit(1)

    errors = False

    for f in files:
        if not f:
            if verbose:
                sys.stderr.write(' error: "{0}" does not exist\n'.format(f.name))
            errors = True
        if any(f.name.endswith(s) for s in no_delete):
            continue
        if args['--dry-run']:
            sys.stdout.write(' will delete: {0}/{1}\n'.format(item.identifier,
                                                              f.name.encode('utf-8')))
            continue
        try:
            resp = f.delete(verbose=verbose,
                            cascade_delete=args['--cascade'],
                            headers=args['--header'],
                            retries=args['--retries'])
        except requests.exceptions.RetryError as e:
            print(' error: max retries exceeded for {0}'.format(f.name), file=sys.stderr)
            errors = True
            continue

        if resp.status_code != 204:
            errors = True
            msg = get_s3_xml_text(resp.content)
            print(' error: {0} ({1})'.format(msg, resp.status_code), file=sys.stderr)
            continue

    if errors is True:
        sys.exit(1)
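
Both --header and --metadata values are parsed with get_args_dict before being handed to the request. A minimal sketch of such a "key:value" parser, assuming the input convention the error messages describe (the real helper may differ):

def get_args_dict(args):
    # Sketch: turn ['collection:test', 'subject:books'] into
    # {'collection': 'test', 'subject': 'books'}; split on the first
    # colon only, so values may themselves contain colons.
    result = {}
    for arg in args or []:
        key, value = arg.split(':', 1)
        result[key.strip()] = value.strip()
    return result
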
Example 4
    def upload_file(self, body,
                    key=None,
                    metadata=None,
                    headers=None,
                    access_key=None,
                    secret_key=None,
                    queue_derive=None,
                    verbose=None,
                    verify=None,
                    checksum=None,
                    delete=None,
                    retries=None,
                    retries_sleep=None,
                    debug=None,
                    request_kwargs=None):
        """Upload a single file to an item. The item will be created
        if it does not exist.

        :type body: Filepath or file-like object.
        :param body: File or data to be uploaded.

        :type key: str
        :param key: (optional) Remote filename.

        :type metadata: dict
        :param metadata: (optional) Metadata used to create a new item.

        :type headers: dict
        :param headers: (optional) Add additional IA-S3 headers to request.

        :type queue_derive: bool
        :param queue_derive: (optional) Set to False to prevent an item from
                             being derived after upload.

        :type verify: bool
        :param verify: (optional) Verify local MD5 checksum matches the MD5
                       checksum of the file received by IAS3.

        :type checksum: bool
        :param checksum: (optional) Skip based on checksum.

        :type delete: bool
        :param delete: (optional) Delete local file after the upload has been
                       successfully verified.

        :type retries: int
        :param retries: (optional) Number of times to retry the given request
                        if S3 returns a 503 SlowDown error.

        :type retries_sleep: int
        :param retries_sleep: (optional) Amount of time to sleep between
                              ``retries``.

        :type verbose: bool
        :param verbose: (optional) Print progress to stdout.

        :type debug: bool
        :param debug: (optional) Set to True to print headers to stdout, and
                      exit without sending the upload request.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> item.upload_file('/path/to/image.jpg',
            ...                  key='photos/image1.jpg')
            True
        """
        # Set defaults.
        headers = {} if headers is None else headers
        metadata = {} if metadata is None else metadata
        access_key = self.session.access_key if access_key is None else access_key
        secret_key = self.session.secret_key if secret_key is None else secret_key
        queue_derive = True if queue_derive is None else queue_derive
        verbose = False if verbose is None else verbose
        verify = True if verify is None else verify
        delete = False if delete is None else delete
        # Set checksum after delete.
        checksum = True if delete else checksum
        retries = 0 if retries is None else retries
        retries_sleep = 30 if retries_sleep is None else retries_sleep
        debug = False if debug is None else debug
        request_kwargs = {} if request_kwargs is None else request_kwargs
        if 'timeout' not in request_kwargs:
            request_kwargs['timeout'] = 120
        md5_sum = None

        if not hasattr(body, 'read'):
            filename = body
            body = open(body, 'rb')
        else:
            if key:
                filename = key
            else:
                filename = body.name

        size = get_file_size(body)

        # Support for uploading empty files.
        if size == 0:
            headers['Content-Length'] = '0'

        if not headers.get('x-archive-size-hint'):
            headers['x-archive-size-hint'] = str(size)

        # Build IA-S3 URL.
        key = norm_filepath(filename).split('/')[-1] if key is None else key
        base_url = '{0.session.protocol}//s3.us.archive.org/{0.identifier}'.format(self)
        url = '{0}/{1}'.format(
            base_url, urllib.parse.quote(norm_filepath(key).lstrip('/').encode('utf-8')))

        # Skip based on checksum.
        if checksum:
            md5_sum = get_md5(body)
            ia_file = self.get_file(key)
            if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
                log.info('{f} already exists: {u}'.format(f=key, u=url))
                if verbose:
                    print(' {f} already exists, skipping.'.format(f=key))
                if delete:
                    log.info(
                        '{f} successfully uploaded to '
                        'https://archive.org/download/{i}/{f} '
                        'and verified, deleting '
                        'local copy'.format(i=self.identifier,
                                            f=key))
                    body.close()
                    os.remove(filename)
                # Return an empty response object if checksums match.
                # TODO: Is there a better way to handle this?
                body.close()
                return Response()

        # Require the Content-MD5 header when verify or delete is True.
        if verify or delete:
            if not md5_sum:
                md5_sum = get_md5(body)
            headers['Content-MD5'] = md5_sum

        def _build_request():
            body.seek(0, os.SEEK_SET)
            if verbose:
                try:
                    # hack to raise exception so we get some output for
                    # empty files.
                    if size == 0:
                        raise Exception

                    chunk_size = 1048576
                    expected_size = size / chunk_size + 1
                    chunks = chunk_generator(body, chunk_size)
                    progress_generator = progress.bar(
                        chunks,
                        expected_size=expected_size,
                        label=' uploading {f}: '.format(f=key))
                    data = IterableToFileAdapter(progress_generator, size)
                except:
                    print(' uploading {f}'.format(f=key))
                    data = body
            else:
                data = body

            headers.update(self.session.headers)
            request = S3Request(method='PUT',
                                url=url,
                                headers=headers,
                                data=data,
                                metadata=metadata,
                                access_key=access_key,
                                secret_key=secret_key,
                                queue_derive=queue_derive)
            return request

        if debug:
            prepared_request = self.session.prepare_request(_build_request())
            body.close()
            return prepared_request
        else:
            try:
                error_msg = ('s3 is overloaded, sleeping for '
                             '{0} seconds and retrying. '
                             '{1} retries left.'.format(retries_sleep, retries))
                while True:
                    if retries > 0:
                        if self.session.s3_is_overloaded(access_key):
                            sleep(retries_sleep)
                            log.info(error_msg)
                            if verbose:
                                print(' warning: {0}'.format(error_msg), file=sys.stderr)
                            retries -= 1
                            continue
                    request = _build_request()
                    prepared_request = request.prepare()

                    # chunked transfer-encoding is NOT supported by IA-S3.
                    # It should NEVER be set. Requests adds it in certain
                    # scenarios (e.g. if content-length is 0). Stop it.
                    if prepared_request.headers.get('transfer-encoding') == 'chunked':
                        del prepared_request.headers['transfer-encoding']

                    response = self.session.send(prepared_request,
                                                 stream=True,
                                                 **request_kwargs)
                    if (response.status_code == 503) and (retries > 0):
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg), file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    else:
                        if response.status_code == 503:
                            log.info('maximum retries exceeded, upload failed.')
                        break
                response.raise_for_status()
                log.info(u'uploaded {f} to {u}'.format(f=key, u=url))
                if delete and response.status_code == 200:
                    log.info(
                        '{f} successfully uploaded to '
                        'https://archive.org/download/{i}/{f} and verified, deleting '
                        'local copy'.format(i=self.identifier, f=key))
                    body.close()
                    os.remove(filename)
                body.close()
                return response
            except HTTPError as exc:
                body.close()
                msg = get_s3_xml_text(exc.response.content)
                error_msg = (' error uploading {0} to {1}, '
                             '{2}'.format(key, self.identifier, msg))
                log.error(error_msg)
                if verbose:
                    print(' error uploading {0}: {1}'.format(key, msg), file=sys.stderr)
                # Raise HTTPError with error message.
                raise type(exc)(error_msg, response=exc.response, request=exc.request)
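
Stripped of logging and the overload pre-check, the 503 retry logic above reduces to a small loop. The following distilled sketch captures just that control flow; send_once stands in for preparing and sending the request.

from time import sleep

def send_with_retries(send_once, retries=3, retries_sleep=30):
    # Retry while S3 answers 503 SlowDown and retry budget remains;
    # otherwise hand the response back to the caller.
    while True:
        response = send_once()
        if response.status_code == 503 and retries > 0:
            sleep(retries_sleep)
            retries -= 1
            continue
        return response
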
Example 5
def main(argv, session, cmd='copy'):
    args = docopt(__doc__, argv=argv)
    src_path = args['<src-identifier>/<src-file>']
    dest_path = args['<dest-identifier>/<dest-file>']

    # If src == dest, the file gets deleted!
    try:
        assert src_path != dest_path
    except AssertionError:
        print('error: The source and destination files cannot be the same!',
              file=sys.stderr)
        sys.exit(1)

    global SRC_ITEM
    SRC_ITEM = session.get_item(src_path.split('/')[0])

    # Validate args.
    s = Schema({
        str:
        Use(bool),
        '<src-identifier>/<src-file>':
        And(
            str,
            And(And(
                str,
                lambda x: '/' in x,
                error='Destination not formatted correctly. See usage example.'
            ),
                assert_src_file_exists,
                error=('https://archive.org/download/{} does not exist. '
                       'Please check the identifier and filepath and retry.'.
                       format(src_path)))),
        '<dest-identifier>/<dest-file>':
        And(str,
            lambda x: '/' in x,
            error='Destination not formatted correctly. See usage example.'),
        '--metadata':
        Or(None,
           And(Use(get_args_dict), dict),
           error='--metadata must be formatted as --metadata="key:value"'),
        '--header':
        Or(None,
           And(Use(get_args_dict), dict),
           error='--header must be formatted as --header="key:value"'),
    })

    try:
        args = s.validate(args)
    except SchemaError as exc:
        # This module is sometimes called by other modules.
        # Replace references to 'ia copy' in __doc__ with 'ia {cmd}' for clarity.
        usage = printable_usage(__doc__.replace('ia copy',
                                                'ia {}'.format(cmd)))
        print('{0}\n{1}'.format(str(exc), usage), file=sys.stderr)
        sys.exit(1)

    args['--header']['x-amz-copy-source'] = '/{}'.format(src_path)
    args['--header']['x-amz-metadata-directive'] = 'COPY'
    # Add keep-old-version by default.
    if 'x-archive-keep-old-version' not in args['--header']:
        args['--header']['x-archive-keep-old-version'] = '1'

    url = '{}//s3.us.archive.org/{}'.format(session.protocol, dest_path)
    req = ia.iarequest.S3Request(url=url,
                                 method='PUT',
                                 metadata=args['--metadata'],
                                 headers=args['--header'],
                                 access_key=session.access_key,
                                 secret_key=session.secret_key)
    p = req.prepare()
    r = session.send(p)
    if r.status_code != 200:
        try:
            msg = get_s3_xml_text(r.text)
        except Exception as e:
            msg = r.text
        print('error: failed to {} "{}" to "{}" - {}'.format(
            cmd, src_path, dest_path, msg))
        sys.exit(1)
    elif cmd == 'copy':
        print('success: copied "{}" to "{}".'.format(src_path, dest_path))
    else:
        return (r, SRC_FILE)
Example 6
def main(argv, session, cmd='copy'):
    args = docopt(__doc__, argv=argv)
    src_path = args['<src-identifier>/<src-file>']
    dest_path = args['<dest-identifier>/<dest-file>']

    # If src == dest, file get's deleted!
    try:
        assert src_path != dest_path
    except AssertionError:
        print('error: The source and destination files cannot be the same!',
              file=sys.stderr)
        sys.exit(1)

    global SRC_ITEM
    SRC_ITEM = session.get_item(src_path.split('/')[0])

    # Validate args.
    s = Schema({
        str: Use(bool),
        '<src-identifier>/<src-file>': And(str, And(And(str, lambda x: '/' in x,
            error='Destination not formatted correctly. See usage example.'),
            assert_src_file_exists, error=(
            'https://archive.org/download/{} does not exist. '
            'Please check the identifier and filepath and retry.'.format(src_path)))),
        '<dest-identifier>/<dest-file>': And(str, lambda x: '/' in x,
            error='Destination not formatted correctly. See usage example.'),
        '--metadata': Or(None, And(Use(get_args_dict), dict),
                         error='--metadata must be formatted as --metadata="key:value"'),
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
    })

    try:
        args = s.validate(args)
    except SchemaError as exc:
        # This module is sometimes called by other modules.
        # Replace references to 'ia copy' in __doc__ with 'ia {cmd}' for clarity.
        usage = printable_usage(__doc__.replace('ia copy', 'ia {}'.format(cmd)))
        print('{0}\n{1}'.format(str(exc), usage), file=sys.stderr)
        sys.exit(1)

    args['--header']['x-amz-copy-source'] = '/{}'.format(src_path)
    args['--header']['x-amz-metadata-directive'] = 'COPY'
    # Add keep-old-version by default.
    if 'x-archive-keep-old-version' not in args['--header']:
        args['--header']['x-archive-keep-old-version'] = '1'

    url = '{}//s3.us.archive.org/{}'.format(session.protocol, dest_path)
    req = ia.iarequest.S3Request(url=url,
                                 method='PUT',
                                 metadata=args['--metadata'],
                                 headers=args['--header'],
                                 access_key=session.access_key,
                                 secret_key=session.secret_key)
    p = req.prepare()
    r = session.send(p)
    if r.status_code != 200:
        try:
            msg = get_s3_xml_text(r.text)
        except Exception as e:
            msg = r.text
        print('error: failed to {} "{}" to "{}" - {}'.format(
            cmd, src_path, dest_path, msg))
        sys.exit(1)
    elif cmd == 'copy':
        print('success: copied "{}" to "{}".'.format(src_path, dest_path))
    else:
        return (r, SRC_FILE)
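
For reference, the same copy can be expressed as a single PUT using plain requests. The sketch below uses placeholder identifiers and keys, and assumes the 'LOW access:secret' Authorization convention accepted by IA-S3.

import requests

headers = {
    'x-amz-copy-source': '/src-item/src-file.txt',  # placeholder source
    'x-amz-metadata-directive': 'COPY',
    'x-archive-keep-old-version': '1',
    'Authorization': 'LOW myaccesskey:mysecretkey',  # placeholder keys
}
r = requests.put('https://s3.us.archive.org/dest-item/dest-file.txt',
                 headers=headers)
print(r.status_code)
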
Example 7
    def upload_file(self,
                    body,
                    key=None,
                    metadata=None,
                    file_metadata=None,
                    headers=None,
                    access_key=None,
                    secret_key=None,
                    queue_derive=None,
                    verbose=None,
                    verify=None,
                    checksum=None,
                    delete=None,
                    retries=None,
                    retries_sleep=None,
                    debug=None,
                    validate_identifier=None,
                    request_kwargs=None):
        """Upload a single file to an item. The item will be created
        if it does not exist.

        :type body: Filepath or file-like object.
        :param body: File or data to be uploaded.

        :type key: str
        :param key: (optional) Remote filename.

        :type metadata: dict
        :param metadata: (optional) Metadata used to create a new item.

        :type file_metadata: dict
        :param file_metadata: (optional) File-level metadata to add to
                              the files.xml entry for the file being
                              uploaded.

        :type headers: dict
        :param headers: (optional) Add additional IA-S3 headers to request.

        :type queue_derive: bool
        :param queue_derive: (optional) Set to False to prevent an item from
                             being derived after upload.

        :type verify: bool
        :param verify: (optional) Verify local MD5 checksum matches the MD5
                       checksum of the file received by IAS3.

        :type checksum: bool
        :param checksum: (optional) Skip based on checksum.

        :type delete: bool
        :param delete: (optional) Delete local file after the upload has been
                       successfully verified.

        :type retries: int
        :param retries: (optional) Number of times to retry the given request
                        if S3 returns a 503 SlowDown error.

        :type retries_sleep: int
        :param retries_sleep: (optional) Amount of time to sleep between
                              ``retries``.

        :type verbose: bool
        :param verbose: (optional) Print progress to stdout.

        :type debug: bool
        :param debug: (optional) Set to True to print headers to stdout, and
                      exit without sending the upload request.

        :type validate_identifier: bool
        :param validate_identifier: (optional) Set to True to validate the identifier before
                                    uploading the file.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> item.upload_file('/path/to/image.jpg',
            ...                  key='photos/image1.jpg')
            True
        """
        # Set defaults.
        headers = headers or {}
        metadata = metadata or {}
        file_metadata = file_metadata or {}
        access_key = access_key or self.session.access_key
        secret_key = secret_key or self.session.secret_key
        queue_derive = bool(queue_derive)
        verbose = bool(verbose)
        verify = bool(verify)
        delete = bool(delete)
        # Set checksum after delete.
        checksum = delete or checksum
        retries = retries or 0
        retries_sleep = retries_sleep or 30
        debug = bool(debug)
        validate_identifier = bool(validate_identifier)
        request_kwargs = request_kwargs or {}
        if 'timeout' not in request_kwargs:
            request_kwargs['timeout'] = 120
        md5_sum = None

        _headers = headers.copy()

        if not hasattr(body, 'read'):
            filename = body
            body = open(body, 'rb')
        else:
            filename = key or body.name

        size = get_file_size(body)

        # Support for uploading empty files.
        if size == 0:
            _headers['Content-Length'] = '0'

        if not _headers.get('x-archive-size-hint'):
            _headers['x-archive-size-hint'] = str(size)

        # Build IA-S3 URL.
        if validate_identifier:
            validate_s3_identifier(self.identifier)
        key = norm_filepath(filename).split('/')[-1] if key is None else key
        base_url = f'{self.session.protocol}//s3.us.archive.org/{self.identifier}'
        url = f'{base_url}/{quote(norm_filepath(key).lstrip("/").encode("utf-8"))}'

        # Skip based on checksum.
        if checksum:
            md5_sum = get_md5(body)
            ia_file = self.get_file(key)
            if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
                log.info(f'{key} already exists: {url}')
                if verbose:
                    print(f' {key} already exists, skipping.', file=sys.stderr)
                if delete:
                    log.info(
                        f'{key} successfully uploaded to '
                        f'https://archive.org/download/{self.identifier}/{key} '
                        'and verified, deleting local copy')
                    body.close()
                    os.remove(filename)
                # Return an empty response object if checksums match.
                # TODO: Is there a better way to handle this?
                body.close()
                return Response()

        # Require the Content-MD5 header when verify or delete is True.
        if verify or delete:
            if not md5_sum:
                md5_sum = get_md5(body)
            _headers['Content-MD5'] = md5_sum

        def _build_request():
            body.seek(0, os.SEEK_SET)
            if verbose:
                try:
                    # hack to raise exception so we get some output for
                    # empty files.
                    if size == 0:
                        raise Exception

                    chunk_size = 1048576
                    expected_size = math.ceil(size / chunk_size)
                    chunks = chunk_generator(body, chunk_size)
                    progress_generator = tqdm(chunks,
                                              desc=f' uploading {key}',
                                              dynamic_ncols=True,
                                              total=expected_size,
                                              unit='MiB')
                    data = IterableToFileAdapter(progress_generator, size)
                except:
                    print(f' uploading {key}', file=sys.stderr)
                    data = body
            else:
                data = body

            _headers.update(self.session.headers)
            request = S3Request(method='PUT',
                                url=url,
                                headers=_headers,
                                data=data,
                                metadata=metadata,
                                file_metadata=file_metadata,
                                access_key=access_key,
                                secret_key=secret_key,
                                queue_derive=queue_derive)
            return request

        if debug:
            prepared_request = self.session.prepare_request(_build_request())
            body.close()
            return prepared_request
        else:
            try:
                while True:
                    error_msg = ('s3 is overloaded, sleeping for '
                                 f'{retries_sleep} seconds and retrying. '
                                 f'{retries} retries left.')
                    if retries > 0:
                        if self.session.s3_is_overloaded(
                                access_key=access_key):
                            sleep(retries_sleep)
                            log.info(error_msg)
                            if verbose:
                                print(f' warning: {error_msg}',
                                      file=sys.stderr)
                            retries -= 1
                            continue
                    request = _build_request()
                    prepared_request = request.prepare()

                    # chunked transfer-encoding is NOT supported by IA-S3.
                    # It should NEVER be set. Requests adds it in certain
                    # scenarios (e.g. if content-length is 0). Stop it.
                    if prepared_request.headers.get(
                            'transfer-encoding') == 'chunked':
                        del prepared_request.headers['transfer-encoding']

                    response = self.session.send(prepared_request,
                                                 stream=True,
                                                 **request_kwargs)
                    if (response.status_code == 503) and (retries > 0):
                        log.info(error_msg)
                        if verbose:
                            print(f' warning: {error_msg}', file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    else:
                        if response.status_code == 503:
                            log.info(
                                'maximum retries exceeded, upload failed.')
                        break
                response.raise_for_status()
                log.info(f'uploaded {key} to {url}')
                if delete and response.status_code == 200:
                    log.info(
                        f'{key} successfully uploaded to '
                        f'https://archive.org/download/{self.identifier}/{key} and verified, '
                        'deleting local copy')
                    body.close()
                    os.remove(filename)
                response.close()
                return response
            except HTTPError as exc:
                try:
                    msg = get_s3_xml_text(exc.response.content)
                except ExpatError:  # probably HTTP 500 error and response is invalid XML
                    msg = (
                        'IA S3 returned invalid XML '
                        f'(HTTP status code {exc.response.status_code}). '
                        'This is a server side error which is either temporary, '
                        'or requires the intervention of IA admins.')

                error_msg = f' error uploading {key} to {self.identifier}, {msg}'
                log.error(error_msg)
                if verbose:
                    print(f' error uploading {key}: {msg}', file=sys.stderr)
                # Raise HTTPError with error message.
                raise type(exc)(error_msg,
                                response=exc.response,
                                request=exc.request)
            finally:
                body.close()
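
norm_filepath is used above to derive the remote key from the local path. Its exact behavior lives in the library's utils module; a plausible sketch of the intent, assumed here for illustration:

def norm_filepath(filepath):
    # Assumed behavior: normalize Windows-style separators so keys are
    # always forward-slash delimited before building the IA-S3 URL.
    return str(filepath).replace('\\', '/')
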
Example 8
def main(argv, session):
    args = docopt(__doc__, argv=argv)

    # Validation error messages.
    invalid_id_msg = (
        '<identifier> should be between 3 and 80 characters in length, and '
        'can only contain alphanumeric characters, underscores ( _ ), or '
        'dashes ( - )')

    # Validate args.
    s = Schema({
        text_type:
        Use(lambda x: bool(x)),
        '<file>':
        list,
        '--format':
        list,
        '--glob':
        list,
        'delete':
        bool,
        '<identifier>':
        Or(None, And(str, validate_ia_identifier, error=invalid_id_msg)),
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)),
              file=sys.stderr)
        sys.exit(1)

    verbose = True if not args['--quiet'] else False
    item = session.get_item(args['<identifier>'])
    if not item.exists:
        print('{0}: skipping, item doesn\'t exist.'.format(args['<identifier>']))

    # Files that cannot be deleted via S3.
    no_delete = ['_meta.xml', '_files.xml', '_meta.sqlite']

    if verbose:
        sys.stdout.write('Deleting files from {0}\n'.format(item.identifier))

    if args['--all']:
        files = [f for f in item.get_files()]
        args['--cascade'] = True
    elif args['--glob']:
        files = item.get_files(glob_pattern=args['--glob'])
    elif args['--format']:
        files = item.get_files(formats=args['--format'])
    else:
        fnames = []
        if args['<file>'] == ['-']:
            fnames = [f.strip().decode('utf-8') for f in sys.stdin]
        else:
            fnames = [f.strip().decode('utf-8') for f in args['<file>']]

        files = [f for f in [item.get_file(f) for f in fnames] if f]

    if not files:
        sys.stderr.write(' warning: no files found, nothing deleted.\n')
        sys.exit(1)

    for f in files:
        if not f:
            if verbose:
                sys.stderr.write(' error: "{0}" does not exist\n'.format(
                    f.name))
            sys.exit(1)
        if any(f.name.endswith(s) for s in no_delete):
            continue
        if args['--dry-run']:
            sys.stdout.write(' will delete: {0}/{1}\n'.format(
                item.identifier, f.name.encode('utf-8')))
            continue
        resp = f.delete(verbose=verbose, cascade_delete=args['--cascade'])
        if resp.status_code != 204:
            msg = get_s3_xml_text(resp.content)
            sys.stderr.write(' error: {0} ({1})\n'.format(
                msg, resp.status_code))
            sys.exit(1)
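
This variant validates the identifier with validate_ia_identifier against the rule spelled out in invalid_id_msg. A sketch matching that stated rule (3 to 80 characters; alphanumerics, underscores, or dashes):

import re

def validate_ia_identifier(identifier):
    # Mirrors the rule in invalid_id_msg above; the real validator may
    # check more than this.
    return bool(re.fullmatch(r'[A-Za-z0-9_-]{3,80}', identifier))
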
Example 9
def main(argv: list[str] | None,
         session: ia.session.ArchiveSession,
         cmd: str = 'copy') -> tuple[Response, ia.files.File]:
    args = docopt(__doc__, argv=argv)
    src_path = args['<src-identifier>/<src-file>']
    dest_path = args['<dest-identifier>/<dest-file>']

    # If src == dest, file gets deleted!
    try:
        assert src_path != dest_path
    except AssertionError:
        print('error: The source and destination files cannot be the same!',
              file=sys.stderr)
        sys.exit(1)

    global SRC_ITEM
    SRC_ITEM = session.get_item(src_path.split('/')[0])  # type: ignore

    # Validate args.
    s = Schema({
        str:
        Use(bool),
        '<src-identifier>/<src-file>':
        And(
            str,
            And(And(
                str,
                lambda x: '/' in x,
                error='Destination not formatted correctly. See usage example.'
            ),
                assert_src_file_exists,
                error=
                (f'https://{session.host}/download/{src_path} does not exist. '
                 'Please check the identifier and filepath and retry.'))),
        '<dest-identifier>/<dest-file>':
        And(str,
            lambda x: '/' in x,
            error='Destination not formatted correctly. See usage example.'),
        '--metadata':
        Or(None,
           And(Use(get_args_dict), dict),
           error='--metadata must be formatted as --metadata="key:value"'),
        '--replace-metadata':
        Use(bool),
        '--header':
        Or(None,
           And(Use(get_args_dict), dict),
           error='--header must be formatted as --header="key:value"'),
        '--ignore-file-metadata':
        Use(bool),
    })

    try:
        args = s.validate(args)
    except SchemaError as exc:
        # This module is sometimes called by other modules.
        # Replace references to 'ia copy' in __doc__ with 'ia {cmd}' for clarity.
        usage = printable_usage(__doc__.replace('ia copy', f'ia {cmd}'))
        print(f'{exc}\n{usage}', file=sys.stderr)
        sys.exit(1)

    args['--header']['x-amz-copy-source'] = f'/{quote(src_path)}'
    # Copy the old metadata verbatim if no additional metadata is supplied,
    # else combine the old and the new metadata in a sensible manner.
    if args['--metadata'] or args['--replace-metadata']:
        args['--header']['x-amz-metadata-directive'] = 'REPLACE'
    else:
        args['--header']['x-amz-metadata-directive'] = 'COPY'

    # New metadata takes precedence over old metadata.
    if not args['--replace-metadata']:
        args['--metadata'] = merge_dictionaries(
            SRC_ITEM.metadata,  # type: ignore
            args['--metadata'])

    # File metadata is copied by default but can be dropped.
    file_metadata = None if args[
        '--ignore-file-metadata'] else SRC_FILE.metadata  # type: ignore

    # Add keep-old-version by default.
    if not args['--header'].get(
            'x-archive-keep-old-version') and not args['--no-backup']:
        args['--header']['x-archive-keep-old-version'] = '1'

    url = f'{session.protocol}//s3.us.archive.org/{quote(dest_path)}'
    queue_derive = True if args['--no-derive'] is False else False
    req = ia.iarequest.S3Request(url=url,
                                 method='PUT',
                                 metadata=args['--metadata'],
                                 file_metadata=file_metadata,
                                 headers=args['--header'],
                                 queue_derive=queue_derive,
                                 access_key=session.access_key,
                                 secret_key=session.secret_key)
    p = req.prepare()
    r = session.send(p)
    if r.status_code != 200:
        try:
            msg = get_s3_xml_text(r.text)
        except Exception as e:
            msg = r.text
        print(f'error: failed to {cmd} "{src_path}" to "{dest_path}" - {msg}',
              file=sys.stderr)
        sys.exit(1)
    elif cmd == 'copy':
        print(f'success: copied "{src_path}" to "{dest_path}".',
              file=sys.stderr)
    return (r, SRC_FILE)  # type: ignore
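
merge_dictionaries combines the source item's metadata with the metadata passed on the command line, with the new values winning, as the comment above notes. A minimal sketch of a shallow merge with that precedence:

def merge_dictionaries(old, new):
    # Sketch: keys present in `new` override those in `old`.
    merged = dict(old or {})
    merged.update(new or {})
    return merged
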
Example 10
def main(argv, session: ArchiveSession) -> None:
    args = docopt(__doc__, argv=argv)

    # Validation error messages.
    invalid_id_msg = (
        '<identifier> should be between 3 and 80 characters in length, and '
        'can only contain alphanumeric characters, underscores ( _ ), or '
        'dashes ( - )')

    # Validate args.
    s = Schema({
        str:
        Use(bool),
        '<file>':
        list,
        '--format':
        list,
        '--header':
        Or(None,
           And(Use(get_args_dict), dict),
           error='--header must be formatted as --header="key:value"'),
        '--glob':
        list,
        'delete':
        bool,
        '--retries':
        Use(lambda i: int(i[0])),
        '<identifier>':
        str,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print(f'{exc}\n{printable_usage(__doc__)}', file=sys.stderr)
        sys.exit(1)

    verbose = True if not args['--quiet'] else False
    item = session.get_item(args['<identifier>'])
    if not item.exists:
        print('{0}: skipping, item doesn\'t exist.'.format(args['<identifier>']),
              file=sys.stderr)

    # Files that cannot be deleted via S3.
    no_delete = ['_meta.xml', '_files.xml', '_meta.sqlite']

    # Add keep-old-version by default.
    if not args['--header'].get(
            'x-archive-keep-old-version') and not args['--no-backup']:
        args['--header']['x-archive-keep-old-version'] = '1'

    if verbose:
        print(f'Deleting files from {item.identifier}', file=sys.stderr)

    if args['--all']:
        files = list(item.get_files())
        args['--cascade'] = True
    elif args['--glob']:
        files = item.get_files(glob_pattern=args['--glob'])
    elif args['--format']:
        files = item.get_files(formats=args['--format'])
    else:
        fnames = []
        if args['<file>'] == ['-']:
            fnames = [f.strip() for f in sys.stdin]
        else:
            fnames = [f.strip() for f in args['<file>']]

        files = list(item.get_files(fnames))

    if not files:
        print(' warning: no files found, nothing deleted.', file=sys.stderr)
        sys.exit(1)

    errors = False

    for f in files:
        if not f:
            if verbose:
                print(f' error: "{f.name}" does not exist', file=sys.stderr)
            errors = True
        if any(f.name.endswith(s) for s in no_delete):
            continue
        if args['--dry-run']:
            print(f' will delete: {item.identifier}/{f.name}', file=sys.stderr)
            continue
        try:
            resp = f.delete(verbose=verbose,
                            cascade_delete=args['--cascade'],
                            headers=args['--header'],
                            retries=args['--retries'])
        except requests.exceptions.RetryError as e:
            print(f' error: max retries exceeded for {f.name}',
                  file=sys.stderr)
            errors = True
            continue

        if resp.status_code != 204:
            errors = True
            msg = get_s3_xml_text(resp.content)
            print(f' error: {msg} ({resp.status_code})', file=sys.stderr)
            continue

    if errors is True:
        sys.exit(1)
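
Finally, a hypothetical invocation of this main() from Python. The argv layout, identifier, and flags are placeholders and depend on the command's actual docopt usage string, which is not shown here.

from internetarchive.session import ArchiveSession

session = ArchiveSession()
main(['delete', 'my-test-item', '--glob=*.txt', '--retries=3'], session)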