def test_ia_search_itemlist(capsys):
    test_scrape_response = load_test_data_file('scrape_response.json')

    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        url1 = (f'{PROTOCOL}//archive.org/services/search/v1/scrape'
                '?q=collection%3Aattentionkmartshoppers'
                '&count=10000')
        url2 = (f'{PROTOCOL}//archive.org/services/search/v1/scrape?'
                'cursor=W3siaWRlbnRpZmllciI6IjE5NjEtTC0wNTkxNCJ9XQ%3D%3D'
                '&q=collection%3Aattentionkmartshoppers'
                '&count=10000')
        rsps.add(responses.POST, url1,
                 body=test_scrape_response,
                 match_querystring=True)
        _j = json.loads(test_scrape_response)
        del _j['cursor']
        _r = json.dumps(_j)
        rsps.add(responses.POST, url2,
                 body=_r,
                 match_querystring=True)
        ia_call(['ia', 'search', 'collection:attentionkmartshoppers',
                 '--itemlist'])
    out, err = capsys.readouterr()
    assert len(set(out.split())) == 100


def insert_test_txt(body):
    body = json.loads(body)
    body['files'].append({
        'name': 'test.txt',
        'md5': 'acbd18db4cc2f85cedef654fccc4a4d8',
    })
    return json.dumps(body)


def review(self, title, body, stars=None):
    u = f'{self.session.protocol}//{self.session.host}/services/reviews.php'
    p = {'identifier': self.identifier}
    d = {'title': title, 'body': body}
    if stars:
        d['stars'] = stars
    a = S3Auth(self.session.access_key, self.session.secret_key)
    r = self.session.post(u, params=p, data=json.dumps(d), auth=a)
    r.raise_for_status()
    return r


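# A minimal usage sketch of ``review`` (hypothetical identifier and
# review text; assumes IA credentials are configured, e.g. via
# ``ia configure``):
#
#     from internetarchive import get_item
#
#     item = get_item('nasa')
#     r = item.review('Great scans', 'Well organized and complete.', stars=5)
#     assert r.status_code == 200

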
def __hash__(self):
    without_excluded_keys = {
        k: v for k, v in self.item_metadata.items()
        if k not in self.EXCLUDED_ITEM_METADATA_KEYS
    }
    return hash(
        json.dumps(without_excluded_keys,
                   sort_keys=True,
                   check_circular=False))


def remove_from_simplelist(self, parent, list):
    """Remove item from a simplelist.

    :rtype: :class:`requests.Response`
    """
    patch = {
        'op': 'delete',
        'parent': parent,
        'list': list,
    }
    data = {
        '-patch': json.dumps(patch),
        '-target': 'simplelists',
    }
    r = self.session.post(self.urls.metadata, data=data)
    return r


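# Illustrative sketch (hypothetical identifiers): removing an item from
# a parent's "holdings" simplelist posts a patch like
#     {"op": "delete", "parent": "parent-item", "list": "holdings"}
# with ``-target`` set to ``simplelists``:
#
#     from internetarchive import get_item
#
#     item = get_item('child-item')
#     r = item.remove_from_simplelist('parent-item', 'holdings')
#     r.raise_for_status()

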
def test_download_dark_item(tmpdir, capsys, nasa_metadata, session):
    tmpdir.chdir()
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        nasa_metadata['metadata']['identifier'] = 'dark-item'
        nasa_metadata['is_dark'] = True
        _item_metadata = json.dumps(nasa_metadata)
        rsps.add(responses.GET, f'{PROTOCOL}//archive.org/metadata/dark-item',
                 body=_item_metadata,
                 content_type='application/json')
        _item = session.get_item('dark-item')
        rsps.add(responses.GET, DOWNLOAD_URL_RE,
                 body='no dest dir',
                 status=403,
                 adding_headers={'content-length': '100'})
        _item.download(files='nasa_meta.xml', verbose=True)
    out, err = capsys.readouterr()
    assert 'skipping dark-item, item is dark' in err


def _prepare_metadata_headers(prepared_metadata, meta_type='meta'):
    headers = {}
    for meta_key, meta_value in prepared_metadata.items():
        # Encode dicts into JSON strings because Archive.org does not
        # yet support complex metadata structures in
        # <identifier>_meta.xml.
        if isinstance(meta_value, dict):
            meta_value = json.dumps(meta_value)
        # Convert the metadata value into a list if it is not already
        # iterable.
        if (isinstance(meta_value, str)
                or not hasattr(meta_value, '__iter__')):
            meta_value = [meta_value]
        # Convert metadata items into HTTP headers and add to
        # ``headers`` dict.
        for i, value in enumerate(meta_value):
            if not value:
                continue
            header_key = f'x-archive-{meta_type}{i:02d}-{meta_key}'
            if isinstance(value, str) and needs_quote(value):
                value = f'uri({quote(value)})'
            # Because RFC 822 HTTP headers disallow _ in names, IA-S3 will
            # translate two hyphens in a row (--) into an underscore (_).
            header_key = header_key.replace('_', '--')
            headers[header_key] = value
    return headers


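# Illustrative example (assumed input, not from the source): given
#     {'title': 'Foo', 'subject': ['a', 'b'], 'collection_id': 'c'}
# the function above produces IA-S3 headers roughly like
#     x-archive-meta00-title: Foo
#     x-archive-meta00-subject: a
#     x-archive-meta01-subject: b
#     x-archive-meta00-collection--id: c
# where the doubled hyphen encodes the underscore in the key name.

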
def test_ia_upload_status_check(capsys):
    with IaRequestsMock() as rsps:
        rsps.add(responses.GET, f'{PROTOCOL}//s3.us.archive.org',
                 body=STATUS_CHECK_RESPONSE,
                 content_type='application/json')

        ia_call(['ia', 'upload', 'nasa', '--status-check'])
        out, err = capsys.readouterr()
        assert 'success: nasa is accepting requests.' in err

        j = json.loads(STATUS_CHECK_RESPONSE)
        j['over_limit'] = 1
        rsps.reset()
        rsps.add(responses.GET, f'{PROTOCOL}//s3.us.archive.org',
                 body=json.dumps(j),
                 content_type='application/json')

        ia_call(['ia', 'upload', 'nasa', '--status-check'],
                expected_exit_code=1)
        out, err = capsys.readouterr()
        assert ('warning: nasa is over limit, and not accepting requests. '
                'Expect 503 SlowDown errors.') in err


def main(argv, session=None):
    args = docopt(__doc__, argv=argv)

    # Validate args.
    s = Schema({
        str: Use(bool),
        '<query>': Use(lambda x: ' '.join(x)),
        '--parameters': Use(lambda x: get_args_dict(x, query_string=True)),
        '--header': Or(None, And(Use(get_args_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
        '--sort': list,
        '--field': list,
        '--timeout': Use(lambda x: float(x[0]),
                         error='--timeout must be integer or float.'),
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print(f'{exc}\n{printable_usage(__doc__)}', file=sys.stderr)
        sys.exit(1)

    # Support comma-separated values.
    fields = list(chain.from_iterable([x.split(',') for x in args['--field']]))
    sorts = list(chain.from_iterable([x.split(',') for x in args['--sort']]))

    r_kwargs = {
        'headers': args['--header'],
        'timeout': args['--timeout'],
    }

    search = session.search_items(args['<query>'],
                                  fields=fields,
                                  sorts=sorts,
                                  params=args['--parameters'],
                                  full_text_search=args['--fts'],
                                  dsl_fts=args['--dsl-fts'],
                                  request_kwargs=r_kwargs)

    try:
        if args['--num-found']:
            print(search.num_found)
            sys.exit(0)
        for result in search:
            if args['--itemlist']:
                print(result.get('identifier', ''))
            else:
                j = json.dumps(result)
                print(j)
            if result.get('error'):
                sys.exit(1)
    except ValueError as e:
        print(f'error: {e}', file=sys.stderr)
    except ConnectTimeout:
        print('error: Request timed out. Increase the --timeout and try again.',
              file=sys.stderr)
        sys.exit(1)
    except ReadTimeout:
        print('error: The server timed out and failed to return all search'
              ' results, please try again.',
              file=sys.stderr)
        sys.exit(1)
    except AuthenticationError as exc:
        print(f'error: {exc}', file=sys.stderr)
        sys.exit(1)


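# Example invocations of this subcommand (flags taken from the argument
# handling above; queries are illustrative):
#
#     ia search 'collection:nasa' --itemlist
#     ia search 'title:(solar system)' --field=identifier,title
#     ia search 'collection:nasa' --num-found

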
def json(self):
    return json.dumps(self.task_dict)


def test_modify_metadata(nasa_item, nasa_metadata):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.POST, f'{PROTOCOL}//archive.org/metadata/nasa')

        # Test simple add.
        md = {'foo': 'bar'}
        p = nasa_item.modify_metadata(md, debug=True)
        _patch = json.dumps([
            {'add': '/foo', 'value': 'bar'},
        ])
        expected_data = {
            'priority': -5,
            '-target': 'metadata',
            '-patch': _patch,
        }
        assert set(p.data.keys()) == set(expected_data.keys())
        assert p.data['priority'] == expected_data['priority']
        assert p.data['-target'] == expected_data['-target']
        assert all(v in p.data['-patch'] for v in ['/foo', 'bar'])

        # Test no changes.
        md = {'title': 'NASA Images'}
        p = nasa_item.modify_metadata(md, debug=True)
        expected_data = {'priority': -5, '-target': 'metadata', '-patch': '[]'}
        assert p.data == expected_data

        # Test remove.
        md = {'title': 'REMOVE_TAG'}
        p = nasa_item.modify_metadata(md, debug=True)
        expected_data = {
            'priority': -5,
            '-target': 'metadata',
            '-patch': json.dumps([{'remove': '/title'}]),
        }
        assert set(p.data.keys()) == set(expected_data.keys())
        assert p.data['priority'] == expected_data['priority']
        assert p.data['-target'] == expected_data['-target']
        assert '/title' in str(p.data['-patch'])
        assert 'remove' in str(p.data['-patch'])

        # Test add array.
        md = {'subject': ['one', 'two', 'last']}
        p = nasa_item.modify_metadata(md, debug=True, priority=-1)
        expected_data = {
            'priority': -1,
            '-target': 'metadata',
            '-patch': json.dumps([{'add': '/subject',
                                   'value': ['one', 'two', 'last']}]),
        }
        assert set(p.data.keys()) == set(expected_data.keys())
        assert p.data['priority'] == expected_data['priority']
        assert p.data['-target'] == expected_data['-target']
        assert ('["one", "two", "last"]' in str(p.data['-patch'])
                or '["one","two","last"]' in str(p.data['-patch']))

        # Test indexed mod.
        nasa_item.item_metadata['metadata']['subject'] = [
            'first', 'middle', 'last',
        ]
        md = {'subject[2]': 'new first'}
        p = nasa_item.modify_metadata(md, debug=True)
        expected_data = {
            'priority': -5,
            '-target': 'metadata',
            '-patch': json.dumps([{'value': 'new first',
                                   'replace': '/subject/2'}]),
        }
        # Avoid comparing the JSON strings directly, because they are
        # not in a canonical form.
        assert set(p.data.keys()) == set(expected_data.keys())
        assert all(p.data[k] == expected_data[k]
                   for k in ['priority', '-target'])
        assert ('/subject/2' in p.data['-patch']
                or r'\/subject\/2' in p.data['-patch'])

        # Test priority.
        md = {'title': 'NASA Images'}
        p = nasa_item.modify_metadata(md, priority=3, debug=True)
        expected_data = {'priority': 3, '-target': 'metadata', '-patch': '[]'}
        assert p.data == expected_data

        # Test auth.
        md = {'title': 'NASA Images'}
        p = nasa_item.modify_metadata(md, access_key='a', secret_key='b',
                                      debug=True)
        assert 'access=a' in p.body
        assert 'secret=b' in p.body

        # Test change.
        md = {'title': 'new title'}
        nasa_metadata['metadata']['title'] = 'new title'
        _item_metadata = json.dumps(nasa_metadata)
        rsps.add(responses.GET, f'{PROTOCOL}//archive.org/metadata/nasa',
                 body=_item_metadata)
        nasa_item.modify_metadata(md, access_key='a', secret_key='b')
        # Test that the item re-initializes.
        assert nasa_item.metadata['title'] == 'new title'


def main(argv, session):
    args = docopt(__doc__, argv=argv)

    # Tasks write API.
    if args['--cmd']:
        if args['--get-rate-limit']:
            r = session.get_tasks_api_rate_limit(args['--cmd'])
            print(json.dumps(r))
            sys.exit(0)
        data = get_args_dict(args['--data'], query_string=True)
        task_args = get_args_dict(args['--task-args'], query_string=True)
        data['args'] = task_args
        r = session.submit_task(args['<identifier>'],
                                args['--cmd'],
                                comment=args['--comment'],
                                priority=data.get('priority'),
                                reduced_priority=args['--reduced-priority'],
                                data=data)
        j = r.json()
        if j.get('success'):
            task_log_url = j.get('value', {}).get('log')
            print(f'success: {task_log_url}', file=sys.stderr)
            sys.exit(0)
        elif 'already queued/running' in j.get('error', ''):
            print(f'success: {args["--cmd"]} task already queued/running',
                  file=sys.stderr)
            sys.exit(0)
        else:
            print(f'error: {j.get("error")}', file=sys.stderr)
            sys.exit(1)

    # Tasks read API.
    params = get_args_dict(args['--parameter'], query_string=True)
    if args['<identifier>']:
        _params = {'identifier': args['<identifier>'],
                   'catalog': 1,
                   'history': 1}
        _params.update(params)
        params = _params
    elif args['--get-task-log']:
        log = session.get_task_log(args['--get-task-log'], params)
        print(log.encode('utf-8', errors='surrogateescape')
                 .decode('utf-8', errors='replace'))
        sys.exit(0)

    queryable_params = [
        'identifier',
        'task_id',
        'server',
        'cmd',
        'args',
        'submitter',
        'priority',
        'wait_admin',
        'submittime',
    ]

    if not (args['<identifier>'] or params.get('task_id')):
        _params = {'catalog': 1, 'history': 0}
        _params.update(params)
        params = _params

    if not any(x in params for x in queryable_params):
        _params = {'submitter': session.user_email,
                   'catalog': 1,
                   'history': 0,
                   'summary': 0}
        _params.update(params)
        params = _params

    if args['--tab-output']:
        warn_msg = ('tab-delimited output will be removed in a future release. '
                    'Please switch to the default JSON output.')
        warnings.warn(warn_msg)
    for t in session.get_tasks(params=params):
        # Legacy support for tab-delimited output.
        if args['--tab-output']:
            color = t.color if t.color else 'done'
            task_args = '\t'.join(f'{k}={v}' for k, v in t.args.items())
            output = '\t'.join(
                str(x) for x in [
                    t.identifier,
                    t.task_id,
                    t.server,
                    t.submittime,
                    t.cmd,
                    color,
                    t.submitter,
                    task_args,
                ] if x)
            print(output)
            sys.stdout.flush()
        else:
            print(t.json())
            sys.stdout.flush()


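# Example invocations of this subcommand (flags taken from the argument
# handling above; the task id and command name are illustrative):
#
#     ia tasks nasa                    # read API: catalog and history rows
#     ia tasks --get-task-log=123456   # print a single task log
#     ia tasks nasa --cmd=derive.php   # write API: submit a task

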
def prepare_body(self, metadata, source_metadata, target,
                 priority, append, append_list):
    priority = -5 if not priority else priority

    if not source_metadata:
        r = requests.get(self.url)
        source_metadata = r.json()

    # Write to many targets.
    if (isinstance(metadata, list)
            or any('/' in k for k in metadata)
            or all(isinstance(k, dict) for k in metadata.values())):
        changes = []
        if any(not k for k in metadata):
            raise ValueError('Invalid metadata provided, '
                             'check your input and try again')
        if target:
            metadata = {target: metadata}
        for key in metadata:
            if key == 'metadata':
                try:
                    patch = prepare_patch(metadata[key],
                                          source_metadata['metadata'],
                                          append,
                                          append_list)
                except KeyError:
                    raise ItemLocateError
            elif key.startswith('files'):
                patch = prepare_files_patch(metadata[key],
                                            source_metadata['files'],
                                            append,
                                            key,
                                            append_list)
            else:
                key = key.split('/')[0]
                patch = prepare_target_patch(metadata,
                                             source_metadata,
                                             append,
                                             target,
                                             append_list,
                                             key)
            changes.append({'target': key, 'patch': patch})

        self.data = {
            '-changes': json.dumps(changes),
            'priority': priority,
        }
        logger.debug(f'submitting metadata request: {self.data}')

    # Write to a single target.
    else:
        if not target or 'metadata' in target:
            target = 'metadata'
            try:
                patch = prepare_patch(metadata,
                                      source_metadata['metadata'],
                                      append,
                                      append_list)
            except KeyError:
                raise ItemLocateError
        elif 'files' in target:
            patch = prepare_files_patch(metadata,
                                        source_metadata['files'],
                                        append,
                                        target,
                                        append_list)
        else:
            metadata = {target: metadata}
            patch = prepare_target_patch(metadata,
                                         source_metadata,
                                         append,
                                         target,
                                         append_list,
                                         target)

        self.data = {
            '-patch': json.dumps(patch),
            '-target': target,
            'priority': priority,
        }
        logger.debug(f'submitting metadata request: {self.data}')

    super().prepare_body(self.data, None)


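# Shapes of the request bodies built above (values illustrative). A
# multi-target write sends ``-changes``:
#     -changes: [{"target": "metadata",
#                 "patch": [{"add": "/foo", "value": "bar"}]}]
# while a single-target write sends ``-patch`` and ``-target``:
#     -patch: [{"add": "/foo", "value": "bar"}]
#     -target: metadata

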
def main(argv, session):
    args = docopt(__doc__, argv=argv)

    # Validate args.
    s = Schema({
        str: bool,
        '<identifier>': list,
        '--modify': list,
        '--header': Or(None, And(Use(get_args_header_dict), dict),
                       error='--header must be formatted as --header="key:value"'),
        '--append': list,
        '--append-list': list,
        '--remove': list,
        '--spreadsheet': Or(None,
                            And(lambda f: os.path.exists(f),
                                error='<file> should be a readable file or directory.')),
        '--target': Or(None, str),
        '--priority': Or(None, Use(int, error='<priority> should be an integer.')),
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print(f'{exc}\n{printable_usage(__doc__)}', file=sys.stderr)
        sys.exit(1)

    formats = set()
    responses = []

    for i, identifier in enumerate(args['<identifier>']):
        item = session.get_item(identifier)

        # Check existence of item.
        if args['--exists']:
            if item.exists:
                responses.append(True)
                print(f'{identifier} exists', file=sys.stderr)
            else:
                responses.append(False)
                print(f'{identifier} does not exist', file=sys.stderr)
            if (i + 1) == len(args['<identifier>']):
                if all(r is True for r in responses):
                    sys.exit(0)
                else:
                    sys.exit(1)

        # Modify metadata.
        elif (args['--modify'] or args['--append'] or args['--append-list']
              or args['--remove']):
            if args['--modify']:
                metadata_args = args['--modify']
            elif args['--append']:
                metadata_args = args['--append']
            elif args['--append-list']:
                metadata_args = args['--append-list']
            if args['--remove']:
                metadata_args = args['--remove']
            try:
                metadata = get_args_dict(metadata_args)
                if any('/' in k for k in metadata):
                    metadata = get_args_dict_many_write(metadata)
            except ValueError:
                print('error: The value of --modify, --remove, --append or '
                      '--append-list is invalid. It must be formatted as: '
                      '--modify=key:value',
                      file=sys.stderr)
                sys.exit(1)

            if args['--remove']:
                responses.append(remove_metadata(item, metadata, args))
            else:
                responses.append(modify_metadata(item, metadata, args))
            if (i + 1) == len(args['<identifier>']):
                if all(r.status_code == 200 for r in responses):
                    sys.exit(0)
                else:
                    for r in responses:
                        if r.status_code == 200:
                            continue
                        # We still want to exit 0 if the non-200 is a
                        # "no changes to xml" error.
                        elif 'no changes' in r.content.decode('utf-8'):
                            continue
                        else:
                            sys.exit(1)

        # Get metadata.
        elif args['--formats']:
            for f in item.get_files():
                formats.add(f.format)
            if (i + 1) == len(args['<identifier>']):
                print('\n'.join(formats))

        # Dump JSON to stdout.
        else:
            metadata = json.dumps(item.item_metadata)
            print(metadata)

    # Edit metadata for items in bulk, using a spreadsheet as input.
    if args['--spreadsheet']:
        if not args['--priority']:
            args['--priority'] = -5
        with open(args['--spreadsheet'], 'r', newline='',
                  encoding='utf-8') as csvfp:
            spreadsheet = csv.DictReader(csvfp)
            responses = []
            for row in spreadsheet:
                if not row['identifier']:
                    continue
                item = session.get_item(row['identifier'])
                if row.get('file'):
                    del row['file']
                metadata = {k.lower(): v for k, v in row.items() if v}
                responses.append(modify_metadata(item, metadata, args))

            if all(r.status_code == 200 for r in responses):
                sys.exit(0)
            else:
                for r in responses:
                    if r.status_code == 200:
                        continue
                    # We still want to exit 0 if the non-200 is a
                    # "no changes to xml" error.
                    elif 'no changes' in r.content.decode('utf-8'):
                        continue
                    else:
                        sys.exit(1)


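# Example invocations of this subcommand (flags taken from the argument
# handling above; file names and values are illustrative):
#
#     ia metadata nasa --modify='title:NASA Images'
#     ia metadata nasa --exists
#     ia metadata --spreadsheet=metadata.csv --priority=-1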