Code Example #1
def get_rate_limit(self, cmd='derive.php'):
    # Method excerpt; it relies on the surrounding class for
    # make_tasks_request() and a module-level json import.
    params = {'rate_limits': 1, 'cmd': cmd}
    r = self.make_tasks_request(params)
    # The Tasks API streams newline-delimited JSON: buffer characters
    # until a newline, then parse the completed line.
    line = ''
    tasks = []
    for c in r.iter_content():
        c = c.decode('utf-8')
        if c == '\n':
            j = json.loads(line)
            tasks.append(CatalogTask(j, self))
            line = ''
        else:
            line += c
    # The rate-limit payload is a single JSON object, so it is whatever
    # remains in the buffer once the stream ends.
    j = json.loads(line)
    return j
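
The same character-by-character scan can be lifted into a standalone helper. A minimal sketch, assuming a streamed requests.Response (iter_content() yields one-byte chunks by default, which is what makes the per-character decode work); parse_jsonl_response is an illustrative name, not part of the library:

import json

def parse_jsonl_response(r):
    """Collect every JSON document from a newline-delimited response."""
    docs = []
    line = ''
    for c in r.iter_content():  # one-byte chunks by default
        c = c.decode('utf-8')
        if c == '\n':
            if line.strip():
                docs.append(json.loads(line))
            line = ''
        else:
            line += c
    if line.strip():  # trailing document without a final newline
        docs.append(json.loads(line))
    return docs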
Code Example #2
def insert_test_txt(body):
    # Append a stub test.txt entry (the MD5 is the digest of the string
    # "foo") to a JSON files listing and re-serialize it.
    body = json.loads(body)
    body['files'].append({
        'name': 'test.txt',
        'md5': 'acbd18db4cc2f85cedef654fccc4a4d8'
    })
    return json.dumps(body)
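
A quick check of what the helper produces; the input below is an invented minimal files listing (the digest shown is the MD5 of an empty file, used purely as a placeholder):

import json

body = json.dumps({'files': [{'name': 'nasa_meta.xml',
                              'md5': 'd41d8cd98f00b204e9800998ecf8427e'}]})
patched = json.loads(insert_test_txt(body))
assert any(f['name'] == 'test.txt' for f in patched['files'])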
Code Example #3
def test_ia_search_itemlist(capsys):
    test_scrape_response = load_test_data_file('scrape_response.json')

    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        url1 = (f'{PROTOCOL}//archive.org/services/search/v1/scrape'
                '?q=collection%3Aattentionkmartshoppers'
                '&count=10000')
        url2 = (f'{PROTOCOL}//archive.org/services/search/v1/scrape?'
                'cursor=W3siaWRlbnRpZmllciI6IjE5NjEtTC0wNTkxNCJ9XQ%3D%3D'
                '&q=collection%3Aattentionkmartshoppers'
                '&count=10000')
        rsps.add(responses.POST,
                 url1,
                 body=test_scrape_response,
                 match_querystring=True)
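        # Second page: the same payload minus its cursor, which tells the
        # client that pagination is complete.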
        _j = json.loads(test_scrape_response)
        del _j['cursor']
        _r = json.dumps(_j)
        rsps.add(responses.POST, url2, body=_r, match_querystring=True)
        ia_call([
            'ia', 'search', 'collection:attentionkmartshoppers', '--itemlist'
        ])

    out, err = capsys.readouterr()
    assert len(set(out.split())) == 100
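
The two mocked pages mirror the scrape API's cursor protocol: each response may carry a cursor that the client posts back until a page arrives without one. A hedged sketch of that loop, assuming the response shape the fixtures imply (an items list plus an optional cursor); scrape_all is an illustrative helper, not the library's implementation:

import requests

def scrape_all(query):
    url = 'https://archive.org/services/search/v1/scrape'
    params = {'q': query, 'count': 10000}
    while True:
        j = requests.post(url, params=params).json()
        yield from j.get('items', [])
        if 'cursor' not in j:
            break  # final page: no cursor means nothing left to fetch
        params['cursor'] = j['cursor']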
Code Example #4
    def get_tasks(self, identifier=None, params=None):
        """Get a list of all tasks meeting all criteria.
        The list is ordered by submission time.

        :type identifier: str
        :param identifier: (optional) The item identifier, if provided
                           will return tasks for only this item filtered by
                           other criteria provided in params.

        :type params: dict
        :param params: (optional) Query parameters, refer to
                       `Tasks API
                       <https://archive.org/services/docs/api/tasks.html>`_
                       for available parameters.

        :rtype: List[CatalogTask]
        """
        params = params if params else {}
        if identifier:
            params.update({'identifier': identifier})
        params.update({'limit': 0})
        if not params.get('summary'):
            params['summary'] = 0
        r = self.make_tasks_request(params)
        line = ''
        tasks = []
        for c in r.iter_content():
            c = c.decode('utf-8')
            if c == '\n':
                j = json.loads(line)
                task = CatalogTask(j, self)
                tasks.append(task)
                line = ''
            else:
                line += c
        if line.strip():
            j = json.loads(line)
            task = CatalogTask(j, self)
            tasks.append(task)

        all_tasks = sorted(tasks, key=sort_by_date, reverse=True)
        return all_tasks
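
Hypothetical usage, assuming a Catalog-like object exposing this method; the parameter names follow the Tasks API documentation linked in the docstring:

tasks = catalog.get_tasks('nasa', params={'catalog': 1, 'history': 1})
for task in tasks:  # newest submissions first
    print(task)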
Code Example #5
def test_ia_upload_status_check(capsys):
    with IaRequestsMock() as rsps:
        rsps.add(responses.GET,
                 f'{PROTOCOL}//s3.us.archive.org',
                 body=STATUS_CHECK_RESPONSE,
                 content_type='application/json')

        ia_call(['ia', 'upload', 'nasa', '--status-check'])
        out, err = capsys.readouterr()
        assert 'success: nasa is accepting requests.' in err

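        # Re-register the mocked endpoint with over_limit switched on to
        # exercise the failure branch.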
        j = json.loads(STATUS_CHECK_RESPONSE)
        j['over_limit'] = 1
        rsps.reset()
        rsps.add(responses.GET,
                 f'{PROTOCOL}//s3.us.archive.org',
                 body=json.dumps(j),
                 content_type='application/json')

        ia_call(['ia', 'upload', 'nasa', '--status-check'],
                expected_exit_code=1)
        out, err = capsys.readouterr()
        assert ('warning: nasa is over limit, and not accepting requests. '
                'Expect 503 SlowDown errors.') in err
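
The test drives both branches by toggling over_limit in the mocked body. A minimal sketch of the probe being exercised, assuming the endpoint returns JSON with an over_limit field as the fixture manipulation shows; the check_limit, bucket, and accesskey parameter names are assumptions, not verified against the service:

import requests

def s3_is_overloaded(identifier=None, access_key=None):
    params = {'check_limit': 1}  # assumed parameter name
    if identifier:
        params['bucket'] = identifier  # assumed parameter name
    if access_key:
        params['accesskey'] = access_key  # assumed parameter name
    j = requests.get('https://s3.us.archive.org', params=params).json()
    return j.get('over_limit', 0) != 0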
Code Example #6
def nasa_metadata():
    return json.loads(load_test_data_file('metadata/nasa.json'))
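
Hypothetical use in a test, assuming metadata/nasa.json holds a Metadata API response for the nasa item (a top-level metadata dict carrying the identifier):

md = nasa_metadata()
assert md['metadata']['identifier'] == 'nasa'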
Code Example #7
def main(argv, session):
    args = docopt(__doc__, argv=argv)
    ERRORS = False

    # Validate args.
    s = Schema({
        str: Use(bool),
        '<identifier>': Or(None, And(
            str, validate_s3_identifier,
            error=('<identifier> should be between 3 and 80 characters in '
                   'length, and can only contain alphanumeric characters, '
                   'periods ".", underscores "_", or dashes "-". However, '
                   '<identifier> cannot begin with periods, underscores, '
                   'or dashes.'))),
        '<file>': And(
            And(lambda f: all(os.path.exists(x) for x in f if x != '-'),
                error='<file> should be a readable file or directory.'),
            And(lambda f: not (f == ['-'] and not args['--remote-name']),
                error='--remote-name must be provided when uploading from stdin.')),
        '--remote-name': Or(None, str),
        '--spreadsheet': Or(None, os.path.isfile,
                            error='--spreadsheet should be a readable file.'),
        '--file-metadata': Or(None, os.path.isfile,
                              error='--file-metadata should be a readable file.'),
        '--metadata': Or(None, And(Use(get_args_dict), dict),
                         error='--metadata must be formatted as --metadata="key:value"'),
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
        '--retries': Use(lambda x: int(x[0]) if x else 0),
        '--sleep': Use(lambda l: int(l[0]),
                       error='--sleep value must be an integer.'),
        '--size-hint': Or(Use(lambda l: str(l[0]) if l else None), int, None,
                          error='--size-hint value must be an integer.'),
        '--status-check': bool,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print(f'{exc}\n{printable_usage(__doc__)}', file=sys.stderr)
        sys.exit(1)

    # Make sure the collection being uploaded to exists.
    collection_id = args['--metadata'].get('collection')
    if (collection_id and not args['--no-collection-check']
            and not args['--status-check']):
        if isinstance(collection_id, list):
            collection_id = collection_id[0]
        collection = session.get_item(collection_id)
        if not collection.exists:
            print(
                'You must upload to a collection that exists. '
                f'"{collection_id}" does not exist.\n{printable_usage(__doc__)}',
                file=sys.stderr)
            sys.exit(1)

    # Status check.
    if args['--status-check']:
        if session.s3_is_overloaded():
            print(
                f'warning: {args["<identifier>"]} is over limit, and not accepting requests. '
                'Expect 503 SlowDown errors.',
                file=sys.stderr)
            sys.exit(1)
        else:
            print(f'success: {args["<identifier>"]} is accepting requests.',
                  file=sys.stderr)
            sys.exit()

    elif args['<identifier>']:
        item = session.get_item(args['<identifier>'])

    # Upload keyword arguments.
    if args['--size-hint']:
        args['--header']['x-archive-size-hint'] = args['--size-hint']
    # Upload with backups turned on by default.
    if (not args['--header'].get('x-archive-keep-old-version')
            and not args['--no-backup']):
        args['--header']['x-archive-keep-old-version'] = '1'

    queue_derive = not args['--no-derive']
    verbose = not args['--quiet']

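    # --file-metadata may be one JSON document or newline-delimited JSON;
    # fall back to line-by-line parsing if the whole-file load fails.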
    if args['--file-metadata']:
        try:
            with open(args['--file-metadata']) as fh:
                args['<file>'] = json.load(fh)
        except JSONDecodeError:
            args['<file>'] = []
            with open(args['--file-metadata']) as fh:
                for line in fh:
                    j = json.loads(line.strip())
                    args['<file>'].append(j)
    upload_kwargs = {
        'metadata': args['--metadata'],
        'headers': args['--header'],
        'debug': args['--debug'],
        'queue_derive': queue_derive,
        'verbose': verbose,
        'verify': args['--verify'],
        'checksum': args['--checksum'],
        'retries': args['--retries'],
        'retries_sleep': args['--sleep'],
        'delete': args['--delete'],
        'validate_identifier': True,
    }

    # Upload files.
    if not args['--spreadsheet']:
        if args['-']:
            local_file = TemporaryFile()
            # sys.stdin normally has the buffer attribute which returns bytes.
            # However, this might not always be the case, e.g. on mocking for test purposes.
            # Fall back to reading as str and encoding back to bytes.
            # Note that the encoding attribute might also be None. In that case, fall back to
            # locale.getpreferredencoding, the default of io.TextIOWrapper and open().
            if hasattr(sys.stdin, 'buffer'):

                def read():
                    return sys.stdin.buffer.read(1048576)
            else:
                encoding = sys.stdin.encoding or getpreferredencoding(False)

                def read():
                    return sys.stdin.read(1048576).encode(encoding)

            while True:
                data = read()
                if not data:
                    break
                local_file.write(data)
            local_file.seek(0)
        else:
            local_file = args['<file>']

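        # --remote-name implies a single source file; if a list slipped
        # through, upload only its first entry under the remote name.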
        if isinstance(local_file, (list, tuple, set)) and args['--remote-name']:
            local_file = local_file[0]
        if args['--remote-name']:
            files = {args['--remote-name']: local_file}
        elif args['--keep-directories']:
            files = {f: f for f in local_file}
        else:
            files = local_file

        for _r in _upload_files(item, files, upload_kwargs):
            if args['--debug']:
                break
            if (not _r.status_code) or (not _r.ok):
                ERRORS = True
            else:
                if args['--open-after-upload']:
                    url = f'{session.protocol}//{session.host}/details/{item.identifier}'
                    webbrowser.open_new_tab(url)

    # Bulk upload using spreadsheet.
    else:
        # Use the same session for each upload request.
        with open(args['--spreadsheet'], 'r', newline='',
                  encoding='utf-8-sig') as csvfp:
            spreadsheet = csv.DictReader(csvfp)
            prev_identifier = None
            for row in spreadsheet:
                for metadata_key in row:
                    if not is_valid_metadata_key(metadata_key):
                        print(
                            f'error: "{metadata_key}" is not a valid metadata key.',
                            file=sys.stderr)
                        sys.exit(1)
                upload_kwargs_copy = deepcopy(upload_kwargs)
                if row.get('REMOTE_NAME'):
                    local_file = {row['REMOTE_NAME']: row['file']}
                    del row['REMOTE_NAME']
                elif args['--keep-directories']:
                    local_file = {row['file']: row['file']}
                else:
                    local_file = row['file']
                identifier = row.get('item', row.get('identifier'))
                if not identifier:
                    if not prev_identifier:
                        print('error: no identifier column on spreadsheet.',
                              file=sys.stderr)
                        sys.exit(1)
                    identifier = prev_identifier
                del row['file']
                if 'identifier' in row:
                    del row['identifier']
                if 'item' in row:
                    del row['item']
                item = session.get_item(identifier)
                # TODO: Clean up how indexed metadata items are coerced
                # into metadata.
                md_args = [f'{k.lower()}:{v}' for (k, v) in row.items() if v]
                metadata = get_args_dict(md_args)
                upload_kwargs_copy['metadata'].update(metadata)
                r = _upload_files(item, local_file, upload_kwargs_copy,
                                  prev_identifier, session)
                for _r in r:
                    if args['--debug']:
                        break
                    if (not _r.status_code) or (not _r.ok):
                        ERRORS = True
                    else:
                        if args['--open-after-upload']:
                            url = f'{session.protocol}//{session.host}/details/{identifier}'
                            webbrowser.open_new_tab(url)
                prev_identifier = identifier

    if ERRORS:
        sys.exit(1)
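
For the spreadsheet branch, the columns consumed above are file, identifier (or item), the optional REMOTE_NAME, and anything else as item metadata; a row with a blank identifier reuses the previous row's via prev_identifier. A hedged sketch that writes a compatible sheet (filenames and values are illustrative):

import csv

with open('uploads.csv', 'w', newline='', encoding='utf-8-sig') as fp:
    w = csv.DictWriter(
        fp, fieldnames=['identifier', 'file', 'REMOTE_NAME', 'title'])
    w.writeheader()
    w.writerow({'identifier': 'my-test-item', 'file': 'one.pdf',
                'REMOTE_NAME': 'renamed.pdf', 'title': 'An example title'})
    # Blank identifier and REMOTE_NAME: this row reuses my-test-item
    # via prev_identifier and keeps its local filename.
    w.writerow({'file': 'two.pdf'})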