def get_rate_limit(self, cmd='derive.php'):
    """Get rate-limit information for a given task command.

    :param cmd: (optional) The task command to query (e.g. 'derive.php').

    :returns: The parsed JSON rate-limit response.
    """
    params = {'rate_limits': 1, 'cmd': cmd}
    r = self.make_tasks_request(params)
    # The response is streamed; accumulate characters and return the final
    # newline-delimited JSON document.
    line = ''
    for c in r.iter_content():
        c = c.decode('utf-8')
        if c == '\n':
            line = ''
        line += c
    return json.loads(line)
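# Hypothetical usage sketch (not part of this module; the ``catalog`` name is
# assumed): with a Catalog bound to an authenticated ArchiveSession,
# get_rate_limit() returns the parsed rate-limit document for a task command:
#
#     limits = catalog.get_rate_limit(cmd='derive.php')
#     print(limits)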
def insert_test_txt(body):
    """Append a stub 'test.txt' entry to a JSON metadata body's file list."""
    body = json.loads(body)
    body['files'].append({
        'name': 'test.txt',
        'md5': 'acbd18db4cc2f85cedef654fccc4a4d8'  # md5 of the string 'foo'
    })
    return json.dumps(body)
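# Illustrative example (input made up): insert_test_txt takes a JSON string with
# a 'files' list and returns the same JSON with the stub entry appended:
#
#     insert_test_txt('{"files": []}')
#     # -> '{"files": [{"name": "test.txt", "md5": "acbd18db4cc2f85cedef654fccc4a4d8"}]}'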
def test_ia_search_itemlist(capsys):
    test_scrape_response = load_test_data_file('scrape_response.json')

    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        url1 = (f'{PROTOCOL}//archive.org/services/search/v1/scrape'
                '?q=collection%3Aattentionkmartshoppers'
                '&count=10000')
        url2 = (f'{PROTOCOL}//archive.org/services/search/v1/scrape?'
                'cursor=W3siaWRlbnRpZmllciI6IjE5NjEtTC0wNTkxNCJ9XQ%3D%3D'
                '&q=collection%3Aattentionkmartshoppers'
                '&count=10000')
        rsps.add(responses.POST, url1,
                 body=test_scrape_response,
                 match_querystring=True)
        _j = json.loads(test_scrape_response)
        del _j['cursor']
        _r = json.dumps(_j)
        rsps.add(responses.POST, url2,
                 body=_r,
                 match_querystring=True)
        ia_call(['ia', 'search', 'collection:attentionkmartshoppers',
                 '--itemlist'])

    out, err = capsys.readouterr()
    assert len(set(out.split())) == 100
def get_tasks(self, identifier=None, params=None):
    """Get a list of all tasks meeting all criteria.
    The list is ordered by submission time.

    :type identifier: str
    :param identifier: (optional) The item identifier. If provided, only tasks
                       for this item are returned, filtered by any other
                       criteria provided in ``params``.

    :type params: dict
    :param params: (optional) Query parameters, refer to
                   `Tasks API <https://archive.org/services/docs/api/tasks.html>`_
                   for available parameters.

    :rtype: List[CatalogTask]
    """
    params = params if params else {}
    if identifier:
        params.update({'identifier': identifier})
    params.update({'limit': 0})
    if not params.get('summary'):
        params['summary'] = 0
    r = self.make_tasks_request(params)
    # The Tasks API streams newline-delimited JSON; build one CatalogTask per line.
    line = ''
    tasks = []
    for c in r.iter_content():
        c = c.decode('utf-8')
        if c == '\n':
            j = json.loads(line)
            task = CatalogTask(j, self)
            tasks.append(task)
            line = ''
        line += c
    # The final document may not be newline-terminated.
    if line.strip():
        j = json.loads(line)
        task = CatalogTask(j, self)
        tasks.append(task)
    all_tasks = sorted(tasks, key=sort_by_date, reverse=True)
    return all_tasks
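# Hypothetical usage sketch (the ``catalog`` name is assumed, not taken from
# this module): fetch every task for a single item, newest first:
#
#     for task in catalog.get_tasks(identifier='nasa'):
#         print(task)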
def test_ia_upload_status_check(capsys):
    with IaRequestsMock() as rsps:
        rsps.add(responses.GET, f'{PROTOCOL}//s3.us.archive.org',
                 body=STATUS_CHECK_RESPONSE,
                 content_type='application/json')

        ia_call(['ia', 'upload', 'nasa', '--status-check'])
        out, err = capsys.readouterr()
        assert 'success: nasa is accepting requests.' in err

        j = json.loads(STATUS_CHECK_RESPONSE)
        j['over_limit'] = 1
        rsps.reset()
        rsps.add(responses.GET, f'{PROTOCOL}//s3.us.archive.org',
                 body=json.dumps(j),
                 content_type='application/json')

        ia_call(['ia', 'upload', 'nasa', '--status-check'],
                expected_exit_code=1)
        out, err = capsys.readouterr()
        assert ('warning: nasa is over limit, and not accepting requests. '
                'Expect 503 SlowDown errors.') in err
def nasa_metadata():
    return json.loads(load_test_data_file('metadata/nasa.json'))
def main(argv, session):
    args = docopt(__doc__, argv=argv)
    ERRORS = False

    # Validate args.
    s = Schema({
        str: Use(bool),
        '<identifier>': Or(None, And(str, validate_s3_identifier,
            error=('<identifier> should be between 3 and 80 characters in length, and '
                   'can only contain alphanumeric characters, periods ".", '
                   'underscores "_", or dashes "-". However, <identifier> cannot begin '
                   'with periods, underscores, or dashes.'))),
        '<file>': And(
            And(lambda f: all(os.path.exists(x) for x in f if x != '-'),
                error='<file> should be a readable file or directory.'),
            And(lambda f: False if f == ['-'] and not args['--remote-name'] else True,
                error='--remote-name must be provided when uploading from stdin.')),
        '--remote-name': Or(None, str),
        '--spreadsheet': Or(None, os.path.isfile,
                            error='--spreadsheet should be a readable file.'),
        '--file-metadata': Or(None, os.path.isfile,
                              error='--file-metadata should be a readable file.'),
        '--metadata': Or(None, And(Use(get_args_dict), dict),
                         error='--metadata must be formatted as --metadata="key:value"'),
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
        '--retries': Use(lambda x: int(x[0]) if x else 0),
        '--sleep': Use(lambda l: int(l[0]),
                       error='--sleep value must be an integer.'),
        '--size-hint': Or(Use(lambda l: str(l[0]) if l else None), int, None,
                          error='--size-hint value must be an integer.'),
        '--status-check': bool,
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print(f'{exc}\n{printable_usage(__doc__)}', file=sys.stderr)
        sys.exit(1)

    # Make sure the collection being uploaded to exists.
    collection_id = args['--metadata'].get('collection')
    if collection_id and not args['--no-collection-check'] and not args['--status-check']:
        if isinstance(collection_id, list):
            collection_id = collection_id[0]
        collection = session.get_item(collection_id)
        if not collection.exists:
            print('You must upload to a collection that exists. '
                  f'"{collection_id}" does not exist.\n{printable_usage(__doc__)}',
                  file=sys.stderr)
            sys.exit(1)

    # Status check.
    if args['--status-check']:
        if session.s3_is_overloaded():
            print(f'warning: {args["<identifier>"]} is over limit, and not accepting requests. '
                  'Expect 503 SlowDown errors.',
                  file=sys.stderr)
            sys.exit(1)
        else:
            print(f'success: {args["<identifier>"]} is accepting requests.',
                  file=sys.stderr)
            sys.exit()

    elif args['<identifier>']:
        item = session.get_item(args['<identifier>'])

    # Upload keyword arguments.
    if args['--size-hint']:
        args['--header']['x-archive-size-hint'] = args['--size-hint']
    # Upload with backups turned on by default.
    if not args['--header'].get('x-archive-keep-old-version') and not args['--no-backup']:
        args['--header']['x-archive-keep-old-version'] = '1'
    queue_derive = True if args['--no-derive'] is False else False
    verbose = True if args['--quiet'] is False else False

    if args['--file-metadata']:
        try:
            with open(args['--file-metadata']) as fh:
                args['<file>'] = json.load(fh)
        except JSONDecodeError:
            args['<file>'] = []
            with open(args['--file-metadata']) as fh:
                for line in fh:
                    j = json.loads(line.strip())
                    args['<file>'].append(j)

    upload_kwargs = {
        'metadata': args['--metadata'],
        'headers': args['--header'],
        'debug': args['--debug'],
        'queue_derive': queue_derive,
        'verbose': verbose,
        'verify': args['--verify'],
        'checksum': args['--checksum'],
        'retries': args['--retries'],
        'retries_sleep': args['--sleep'],
        'delete': args['--delete'],
        'validate_identifier': True,
    }

    # Upload files.
    if not args['--spreadsheet']:
        if args['-']:
            local_file = TemporaryFile()
            # sys.stdin normally has the buffer attribute which returns bytes.
            # However, this might not always be the case, e.g. on mocking for test purposes.
            # Fall back to reading as str and encoding back to bytes.
            # Note that the encoding attribute might also be None. In that case, fall back to
            # locale.getpreferredencoding, the default of io.TextIOWrapper and open().
            if hasattr(sys.stdin, 'buffer'):
                def read():
                    return sys.stdin.buffer.read(1048576)
            else:
                encoding = sys.stdin.encoding or getpreferredencoding(False)

                def read():
                    return sys.stdin.read(1048576).encode(encoding)
            while True:
                data = read()
                if not data:
                    break
                local_file.write(data)
            local_file.seek(0)
        else:
            local_file = args['<file>']

        if isinstance(local_file, (list, tuple, set)) and args['--remote-name']:
            local_file = local_file[0]
        if args['--remote-name']:
            files = {args['--remote-name']: local_file}
        elif args['--keep-directories']:
            files = {f: f for f in local_file}
        else:
            files = local_file

        for _r in _upload_files(item, files, upload_kwargs):
            if args['--debug']:
                break
            if (not _r.status_code) or (not _r.ok):
                ERRORS = True
            else:
                if args['--open-after-upload']:
                    url = f'{session.protocol}//{session.host}/details/{item.identifier}'
                    webbrowser.open_new_tab(url)

    # Bulk upload using spreadsheet.
    else:
        # Use the same session for each upload request.
        with open(args['--spreadsheet'], 'r', newline='', encoding='utf-8-sig') as csvfp:
            spreadsheet = csv.DictReader(csvfp)
            prev_identifier = None
            for row in spreadsheet:
                for metadata_key in row:
                    if not is_valid_metadata_key(metadata_key):
                        print(f'error: "{metadata_key}" is not a valid metadata key.',
                              file=sys.stderr)
                        sys.exit(1)
                upload_kwargs_copy = deepcopy(upload_kwargs)
                if row.get('REMOTE_NAME'):
                    local_file = {row['REMOTE_NAME']: row['file']}
                    del row['REMOTE_NAME']
                elif args['--keep-directories']:
                    local_file = {row['file']: row['file']}
                else:
                    local_file = row['file']
                identifier = row.get('item', row.get('identifier'))
                if not identifier:
                    if not prev_identifier:
                        print('error: no identifier column on spreadsheet.',
                              file=sys.stderr)
                        sys.exit(1)
                    identifier = prev_identifier
                del row['file']
                if 'identifier' in row:
                    del row['identifier']
                if 'item' in row:
                    del row['item']
                item = session.get_item(identifier)
                # TODO: Clean up how indexed metadata items are coerced
                # into metadata.
                md_args = [f'{k.lower()}:{v}' for (k, v) in row.items() if v]
                metadata = get_args_dict(md_args)
                upload_kwargs_copy['metadata'].update(metadata)
                r = _upload_files(item, local_file, upload_kwargs_copy,
                                  prev_identifier, session)
                for _r in r:
                    if args['--debug']:
                        break
                    if (not _r.status_code) or (not _r.ok):
                        ERRORS = True
                    else:
                        if args['--open-after-upload']:
                            url = f'{session.protocol}//{session.host}/details/{identifier}'
                            webbrowser.open_new_tab(url)
                prev_identifier = identifier

    if ERRORS:
        sys.exit(1)
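# Illustrative spreadsheet layout for the bulk-upload branch above (identifiers,
# paths, and metadata values are made up):
#
#     identifier,file,title
#     my-example-item,/tmp/a.pdf,An example title
#     ,/tmp/b.pdf,A second file for the same item
#
# A blank identifier reuses the previous row's identifier, a REMOTE_NAME column
# renames the uploaded file, and every other non-empty column is treated as
# item metadata.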