def test_search_items_fts(session):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.POST,
                 '{0}//be-api.us.archive.org/ia-pub-fts-api'.format(PROTOCOL),
                 body=TEST_SCRAPE_RESPONSE)
        rsps.add_metadata_mock('nasa')

        r = search_items('nina simone', full_text_search=True, archive_session=session)
        print(r.search_url)
        assert r.fts == True
        assert r.dsl_fts == False
        assert r.query == '!L nina simone'
        assert r.params == {'count': 10000, 'q': '!L nina simone'}

        r = search_items('nina simone', full_text_search=True, dsl_fts=True,
                         archive_session=session)
        assert r.fts == True
        assert r.dsl_fts == True
        assert r.query == 'nina simone'
        assert r.params == {'count': 10000, 'q': 'nina simone'}

        r = search_items('nina simone', dsl_fts=True, archive_session=session)
        assert r.fts == True
        assert r.dsl_fts == True
        assert r.query == 'nina simone'
        assert r.params == {'count': 10000, 'q': 'nina simone'}
def main(argv):
    args = docopt(__doc__, argv=argv)
    params = dict(p.split(':') for p in args['--parameters'])

    # Format sort parameters.
    if args['--sort']:
        for i, field in enumerate(args['--sort']):
            key = 'sort[{0}]'.format(i)
            params[key] = field.strip().replace(':', ' ')

    query = ' '.join(args['<query>'])
    if args['--itemlist']:
        fields = ['identifier']
    else:
        fields = args['--field']

    # Use the computed field list so --itemlist restricts output to identifiers.
    search = search_items(query, fields=fields, params=params, v2=args['--v2'])
    if args['--number-found']:
        sys.stdout.write('{0}\n'.format(search.num_found))
        sys.exit(0)
    for result in search:
        try:
            if args['--itemlist']:
                sys.stdout.write(result.get('identifier', ''))
            else:
                json.dump(result, sys.stdout)
            sys.stdout.write('\n')
        except IOError:
            sys.exit(0)
def test_search_items():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j['response']['numFound'] = 1
    _search_r = json.dumps(_j)
    results_url = ('{0}//archive.org/services/search/beta/scrape.php'
                   '?q=identifier%3Anasa&size=10000'.format(protocol))
    count_url = ('{0}//archive.org/services/search/beta/scrape.php'
                 '?q=identifier%3Anasa&total_only=true'.format(protocol))
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET, results_url,
                 body=TEST_SCRAPE_RESPONSE,
                 match_querystring=True,
                 status=200)
        rsps.add(responses.GET, count_url,
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=True,
                 content_type='application/json; charset=UTF-8',
                 status=200)
        r = search_items('identifier:nasa')
        expected_results = [{'identifier': 'nasa'}]
        assert r.num_found == 1
        assert iter(r).search == r
        assert len(iter(r)) == 1
        assert len(r.iter_as_results()) == 1
        assert list(r) == expected_results
        assert list(r.iter_as_results()) == expected_results
def internet_archive_download(destination_directory, collection='MBLWHOI', pdf_num=None):
    """
    Uses the internetarchive Python package to stream pdf pages from a given
    collection into a provided destination_directory.
    """
    print('Beginning internet archive download...')
    for count, result in enumerate(internetarchive.search_items('collection:' + collection)):
        # Stop once the requested number of PDFs has been downloaded.
        if pdf_num is not None and count == pdf_num:
            break
        archive_id = result['identifier']
        item_dir = os.path.join(destination_directory, archive_id)
        try:
            # Download only if the item has not been fetched yet
            # (directory missing or empty).
            if not os.path.exists(item_dir) or os.listdir(item_dir) == []:
                internetarchive.download(archive_id, verbose=True,
                                         glob_pattern='*.pdf',
                                         destdir=destination_directory)
        except KeyboardInterrupt:
            print('Cancelling download.')
            break
        except Exception:
            print('ERROR downloading', archive_id)
    return
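# A minimal, hypothetical invocation of the helper above; the directory path
# and pdf_num value are illustrative assumptions, not from the original source.
internet_archive_download('/tmp/ia_pdfs', collection='MBLWHOI', pdf_num=5)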
def test_search_items():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j['response']['numFound'] = 1
    _search_r = json.dumps(_j)
    results_url = ('{0}//archive.org/services/search/beta/scrape.php'
                   '?q=identifier%3Anasa&size=10000&REQUIRE_AUTH=true'.format(protocol))
    count_url = ('{0}//archive.org/services/search/beta/scrape.php'
                 '?q=identifier%3Anasa&total_only=true&REQUIRE_AUTH=true'
                 '&size=10000'.format(protocol))
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.POST, results_url,
                 body=TEST_SCRAPE_RESPONSE,
                 match_querystring=True,
                 status=200)
        rsps.add(responses.POST, count_url,
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=True,
                 content_type='application/json; charset=UTF-8',
                 status=200)
        r = search_items('identifier:nasa')
        expected_results = [{'identifier': 'nasa'}]
        assert r.num_found == 1
        assert iter(r).search == r
        assert len(iter(r)) == 1
        assert len(r.iter_as_results()) == 1
        assert list(r) == expected_results
        assert list(r.iter_as_results()) == expected_results
def readdir(self, path, fh):
    dirents = ['.', '..']
    full_path = self._full_path(path)
    # Yield the standard directory entries first.
    for d in dirents:
        yield d
    # search IA.. (ls ./nasa => ?query=nasa)
    for r in search_items(full_path).iter_as_items():
        yield r
def get_sponsored_books():
    """Performs the `ia` query to fetch sponsored books from archive.org"""
    # XXX Note: This `search_items` query requires the `ia` tool (the
    # one installed via virtualenv) to be configured with (scope:all)
    # privileged s3 keys.
    items = ia.search_items(
        'collection:openlibraryscanningteam',
        fields=['identifier', 'est_book_price', 'est_scan_price', 'scan_price',
                'book_price', 'repub_state', 'imagecount', 'title', 'donor',
                'openlibrary_edition', 'publicdate', 'collection', 'isbn'],
        params={'page': 1, 'rows': 1000},
        config={'general': {'secure': False}},
    )
    return [item for item in items
            if not (item.get('repub_state') == '-1'
                    and item.get('donor') in BLOCKED_PATRONS)]
def test_search_items(session):
    results_url = ('{0}//archive.org/services/search/v1/scrape'
                   '?q=identifier%3Anasa&count=10000&REQUIRE_AUTH=true'.format(PROTOCOL))
    count_url = ('{0}//archive.org/services/search/v1/scrape'
                 '?q=identifier%3Anasa&total_only=true&REQUIRE_AUTH=true'
                 '&count=10000'.format(PROTOCOL))
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.POST, results_url,
                 body=TEST_SCRAPE_RESPONSE,
                 match_querystring=True)
        rsps.add(responses.POST, count_url,
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=True,
                 content_type='application/json; charset=UTF-8')
        r = search_items('identifier:nasa', archive_session=session)
        expected_results = [{'identifier': 'nasa'}]
        assert r.num_found == 1
        assert iter(r).search == r
        assert len(iter(r)) == 1
        assert len(r.iter_as_results()) == 1
        assert list(r) == expected_results
        assert list(r.iter_as_results()) == expected_results
def show_all(self, collection=None):
    # Fall back to the default collection only when none is given;
    # the original unconditionally overwrote the parameter.
    collection = collection or self.DEFAULT_COLLECTION
    search = ia.search_items('collection:' + collection)
    data = []
    for i in search:
        data.append(i)
    return data
def queueSearch(search_string):
    redis_conn = Redis('redis')
    q = Queue('normal', connection=redis_conn)
    search = search_items(search_string)
    for result in search:
        print(result['identifier'])
        job = q.enqueue(processMetadata_v3.main, result['identifier'])
def get_collection_items(collection_id):
    search_string = 'collection:' + collection_id
    search_results = ia.search_items(search_string)
    item_objects = []
    for item in search_results.iter_as_items():
        item_objects.append(item)
    print("Items found in collection: %d" % (len(item_objects)))
    return item_objects
def append_acct_meta(uploader_email, add_subject):
    # get a list of all items made by the uploader
    print("Searching for all items...")
    search = search_items('uploader:%s' % uploader_email)
    # add subjects to each item in search results
    for result in search:
        append_meta(result['identifier'], add_subject)
def test_search_items_as_items(session):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.POST,
                 '{0}//archive.org/services/search/v1/scrape'.format(PROTOCOL),
                 body=TEST_SCRAPE_RESPONSE)
        rsps.add_metadata_mock('nasa')
        r = search_items('identifier:nasa', archive_session=session)
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
def lookup_case_insensitive_identifier(identifier):
    """Perform a case-insensitive lookup"""
    params = dict(page=1)
    search_results = internetarchive.search_items('identifier:' + identifier,
                                                  params=params, config=config)
    ids = [r['identifier'] for r in search_results]
    if len(ids) > 0:
        identifier = ids[0]
    return identifier
def test_search_items_as_items():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.POST,
                 '{0}//archive.org/services/search/v1/scrape'.format(PROTOCOL),
                 body=TEST_SCRAPE_RESPONSE)
        rsps.add_metadata_mock('nasa')
        r = search_items('identifier:nasa')
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
def lists_ia_collection(collection):
    '''
    :param collection: Name of Collection to search taken from ARGS
    :return: list, result of IA.api search for collections of given name
    '''
    archive_search = internetarchive.search_items(collection)
    if archive_search.num_found < 1:
        raise Exception("No Collections found named {}".format(collection))
    else:
        return archive_search
def _search_collection(collection_name):
    """
    Searches the internet archive for the specified collection.
    If no items are found for the collection, returns None;
    otherwise the Search object is returned.
    """
    collection = internetarchive.search_items('collection:{}'.format(collection_name))
    if collection.num_found == 0:
        return None
    else:
        return collection
def random_mixtape():
    '''Return a random mixtape item'''
    mixtapes = []
    for i in internetarchive.search_items('collection:hiphopmixtapes'):
        mixtapes.append(i['identifier'])
    # select a random mixtape
    mixtape = mixtapes[randint(0, len(mixtapes) - 1)]
    return mixtape
def test_search_items_with_fields():
    _j = json.loads(TEST_SCRAPE_RESPONSE)
    _j['items'] = [{'identifier': 'nasa', 'title': 'NASA Images'}]
    search_response_str = json.dumps(_j)
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/services/search/beta/scrape.php'.format(protocol),
                 body=search_response_str, status=200)
        r = search_items('identifier:nasa', fields=['identifier', 'title'])
        assert list(r) == [{'identifier': 'nasa', 'title': 'NASA Images'}]
def test_page_row_specification():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j["response"]["docs"] = [{"identifier": "nasa"}]
    _j["response"]["numFound"] = 1
    _search_r = json.dumps(_j)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/advancedsearch.php".format(protocol),
                 body=_search_r, status=200)
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA, status=200)
        r = search_items("identifier:nasa", params={"page": "1", "rows": "1"})
        assert [x.identifier for x in r.iter_as_items()] == ["nasa"]
        assert r.iter_as_items().search == r
        assert len(r.iter_as_items()) == 1
def test_search_items_as_items():
    search_response_str = json.dumps(SEARCH_RESPONSE)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/advancedsearch.php'.format(protocol),
                 body=search_response_str, status=200)
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA, status=200)
        r = search_items('identifier:nasa')
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
def test_search_items_as_items():
    search_response_str = json.dumps(TEST_SCRAPE_RESPONSE)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/services/search/beta/scrape.php".format(protocol),
                 body=TEST_SCRAPE_RESPONSE, status=200)
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA, status=200)
        r = search_items("identifier:nasa")
        assert [x.identifier for x in r.iter_as_items()] == ["nasa"]
        assert r.iter_as_items().search == r
def get_book_items(query, rows=100, page=1, scope_all=False):
    """
    :param str query: a search query for selecting/faceting books
    :param int rows: limit how many results returned
    :param int page: starting page to offset search results
    :return: an iterator over `internetarchive` Items
    """
    params = {'page': page, 'rows': rows}
    if scope_all:
        params['scope'] = 'all'
    # this may need to get run as a session (priv'd access)
    return ia.search_items(query, params=params).iter_as_items()
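# A minimal usage sketch for get_book_items; the query and row count are
# illustrative assumptions. iter_as_items() yields Item objects, so each
# result exposes .identifier along with its full metadata.
for book in get_book_items('collection:openlibrary', rows=10):
    print(book.identifier)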
def get_collection_ids(self, collection=DEFAULT_COLLECTION, iter_as_items=False):
    # search
    search = search_items('collection:' + collection)
    total = search.num_found
    if iter_as_items:
        search = search.iter_as_items()
    print(f'>> [{self.name}] scanning', total, f'items in collection {collection}')
    # loop
    for i, result in enumerate(tqdm(search, total=total)):
        yield result['identifier'] if not iter_as_items else result
def get_metadata(query: str, output: str):
    for i in search_items(query):
        id = i["identifier"]
        if output is not None:
            os.makedirs(output, exist_ok=True)
            contents = (urllib.request.urlopen(
                f'https://www.archive.org/metadata/{id}').read().decode("utf-8"))
            out_file = os.path.join(output, f"{id}.json")
            print(f"Writing {out_file}...")
            with open(out_file, "w+") as o:
                o.write(contents)
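# Hedged aside: the metadata endpoint fetched above with urllib is also
# exposed by the internetarchive package itself; a minimal sketch of the
# equivalent call, assuming only that `internetarchive` is installed:
from internetarchive import get_item

def get_metadata_via_api(identifier: str) -> dict:
    # item_metadata holds the parsed /metadata/{identifier} JSON response
    return get_item(identifier).item_metadata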
def test_search_items_with_fields():
    search_r = deepcopy(SEARCH_RESPONSE)
    search_r['response']['docs'] = [{'identifier': 'nasa', 'title': 'NASA Images'}]
    search_response_str = json.dumps(search_r)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/advancedsearch.php'.format(protocol),
                 body=search_response_str, status=200)
        r = search_items('identifier:nasa', fields=['identifier', 'title'])
        assert r.num_found == 1
        assert list(r) == [{'identifier': 'nasa', 'title': 'NASA Images'}]
def sync_completed_sponsored_books(dryrun=False):
    """Retrieves a list of all completed sponsored books from Archive.org
    so they can be synced with Open Library, which entails:

    - adding IA ocaid into openlibrary edition
    - alerting patrons (if possible) by email of completion
    - possibly marking archive.org item status as complete/synced

    XXX Note: This `search_items` query requires the `ia` tool (the one
    installed via virtualenv) to be configured with (scope:all)
    privileged s3 keys.
    """
    items = ia.search_items(
        'collection:openlibraryscanningteam AND collection:inlibrary',
        fields=['identifier', 'openlibrary_edition'],
        params={'page': 1, 'rows': 1000, 'scope': 'all'},
        config={'general': {'secure': False}},
    )
    books = web.ctx.site.get_many([
        '/books/%s' % i.get('openlibrary_edition')
        for i in items if i.get('openlibrary_edition')
    ])
    unsynced = [book for book in books if not book.ocaid]
    ocaid_lookup = {
        '/books/%s' % i.get('openlibrary_edition'): i.get('identifier')
        for i in items
    }
    fixed = []
    for book in unsynced:
        book.ocaid = ocaid_lookup[book.key]
        with accounts.RunAs('ImportBot'):
            if not dryrun:
                web.ctx.site.save(book.dict(), "Adding ocaid for completed sponsorship")
        fixed.append({'key': book.key, 'ocaid': book.ocaid})
        # TODO: send out an email?... Requires Civi.
        if book.ocaid.startswith("isbn_"):
            isbn = book.ocaid.split("_")[-1]
            sponsorship = get_sponsorship_by_isbn(isbn)
            contact = sponsorship and sponsorship.get("contact")
            email = contact and contact.get("email")
            if not dryrun and email:
                email_sponsor(email, book)
    return json.dumps(fixed)
def get_from_ia(reporter, volume):
    """
    Download cases from internet archive via case law and write them to disk.

    :param reporter: (str) Requires a reporter abbreviation to identify
        cases to download as used by IA. (Ex. T.C. => tc)
    :param volume: (int) Specific volume number of the reporter. If blank
        function will cycle through all volumes of the reporter on IA.
    :return: None
    """
    reporter_key = ".".join(['law.free.cap', reporter])
    # Checks that the returned reporter is the requested one.
    # Ex. searching for Mich will return both Mich-app. and Mich.
    for ia_identifier in search_items(reporter_key):
        ia_key = ia_identifier['identifier']
        if ia_key.split(".")[3] != reporter:
            continue
        # Checks if we requested a specific volume of the
        # reporter and if so skips all other volumes of that reporter.
        # The identifier component is a string, so compare as strings.
        ia_volume = ia_key.split(".")[-1]
        if volume is not None and str(volume) != ia_volume:
            continue
        for item in get_files(ia_key):
            if "json.json" in item.name:
                continue
            if "json" in item.name:
                url = "https://archive.org/download/%s/%s" % (ia_key, item.name)
                file_path = os.path.join(settings.MEDIA_ROOT,
                                         'harvard_corpus',
                                         '%s' % ia_key,
                                         '%s' % item.name)
                directory = file_path.rsplit("/", 1)[0]
                if os.path.exists(file_path):
                    logger.info("Already captured: %s", url)
                    continue
                logger.info("Capturing: %s", url)
                mkdir_p(directory)
                data = requests.get(url, timeout=10).json()
                with open(file_path, 'w') as outfile:
                    json.dump(data, outfile, indent=2)
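# A hypothetical call using the docstring's own example abbreviation
# (T.C. => tc); passing volume=None walks every volume of that reporter.
get_from_ia('tc', volume=None)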
def run_indexing(self):
    req = internetarchive.search_items('collection:archiveteam_newssites',
                                       config_file='account.ini',
                                       config={'general': {'secure': False}})
    if not req:
        return
    r = [s['identifier'] for s in req]
    for item in r:
        if item in self.indexed:
            continue
        if len(item.split('_')[-1]) >= 14 and item.count('_') == 2:
            if item not in self.items:
                self.items[item] = Item(item)
            self.items[item].run()
    del self.indexed
def get_sponsored_books():
    """Performs the `ia` query to fetch sponsored books from archive.org"""
    from internetarchive import search_items
    params = {'page': 1, 'rows': 1000, 'scope': 'all'}
    fields = ['identifier', 'est_book_price', 'est_scan_price', 'scan_price',
              'book_price', 'repub_state', 'imagecount', 'title', 'donor',
              'openlibrary_edition', 'publicdate', 'collection', 'isbn']
    q = 'collection:openlibraryscanningteam'
    # XXX Note: This `search_items` query requires the `ia` tool (the
    # one installed via virtualenv) to be configured with (scope:all)
    # privileged s3 keys.
    config = {'general': {'secure': False}}
    return search_items(q, fields=fields, params=params, config=config)
def test_search_items():
    search_response_str = json.dumps(SEARCH_RESPONSE)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/advancedsearch.php'.format(protocol),
                 body=search_response_str, status=200)
        r = search_items('identifier:nasa')
        expected_results = [{'identifier': 'nasa'}]
        assert r.num_found == 1
        assert len(r) == 1
        assert iter(r).search == r
        assert len(iter(r)) == 1
        assert len(r.iter_as_results()) == 1
        assert list(r) == expected_results
        assert list(r.iter_as_results()) == expected_results
def fetch():
    error_log = open(ERRLOG, "a")
    errors = 0
    collection = sys.argv[1]
    search = internetarchive.search_items("collection:" + collection)
    for result in search:
        itemid = result["identifier"]
        item = internetarchive.get_item(itemid)
        try:
            item.download(destdir=COLLECTIONFOLDER)
        except Exception as e:
            error_log.write("Could not download " + itemid + " because of error: %s\n" % e)
            errors += 1
            print("There was an error; writing to log.")
        else:
            time.sleep(1)
def upload_collection(self):
    self.logger.info("Searching the Internet Archive...")
    results = search_items(query='collection:' + self.collection_name,
                           fields=["identifier"])
    number_items = results.num_found
    self.logger.info(str(number_items) + " items found in collection '"
                     + str(self.collection_name) + "'.")
    self.logger.info("Starting download...")
    for i, s in enumerate(results):
        identifier = s.get("identifier")
        item = get_item(identifier)
        m = self.get_metadata(item)
        if not m:
            continue
        r = self.download_item(m)
        p = self.uploader.upload_claim(m)
    # Return after the whole collection has been processed.
    return True
def update_archive_books():
    # Earlier field lists and queries, kept for reference:
    #   fields = ('creator', 'contributor', 'date', 'description', 'genre',
    #             'language', 'name', 'publisher', 'source', 'scanningcenter',
    #             'title', 'subject', 'volume')
    #   query = 'language:Telugu and mediatype:texts'
    #   query = 'mediatype:texts and languageSorter:Telugu'
    query = 'languageSorter:Telugu'
    data = []
    try:
        df = pd.read_csv('data/ia.csv', index_col=['identifier'])
    except FileNotFoundError:
        print('Creating new file')
        df = pd.DataFrame()
    for index, item in enumerate(ia.search_items(query=query)):
        print(index, item)
        pk = item['identifier']
        if pk in df.index:
            continue
        item = ia.get_item(pk)
        metadata = item.item_metadata.get('metadata', {'item_url': ''})
        metadata['item_url'] = item.urls.details
        print(metadata['item_url'])
        data.append(metadata)
        # Checkpoint every fifth item by merging into the CSV on disk.
        if index % 5 == 0:
            dfo = pd.read_csv('data/ia.csv', index_col=['identifier'])
            df = pd.DataFrame(data)
            df.set_index('identifier', inplace=True)
            df = pd.concat([dfo, df])
            df.to_csv('data/ia.csv')
    df = pd.read_csv('data/ia.csv', index_col=['identifier'])
    df.drop_duplicates(inplace=True)
    df.to_csv('data/ia.csv')
    print(df.shape, len(df))
    print('file saved')
def main(argv, session=None):
    args = docopt(__doc__, argv=argv)

    # Validate args.
    s = Schema({
        six.text_type: Use(bool),
        '<query>': Use(lambda x: ' '.join(x)),
        '--parameters': Use(lambda x: get_args_dict(x, query_string=True)),
        '--sort': list,
        '--field': Use(lambda x: ['identifier'] if not x and args['--itemlist'] else x),
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)), file=sys.stderr)
        sys.exit(1)

    # Support comma separated values.
    fields = list(chain.from_iterable([x.split(',') for x in args['--field']]))
    sorts = list(chain.from_iterable([x.split(',') for x in args['--sort']]))

    search = search_items(args['<query>'],
                          fields=fields,
                          sorts=sorts,
                          params=args['--parameters'])

    if args['--num-found']:
        print('{0}'.format(search.num_found))
        sys.exit(0)

    try:
        for result in search:
            if args['--itemlist']:
                print(result.get('identifier', ''))
            else:
                j = json.dumps(result)
                print(j)
    except ValueError as e:
        print('error: {0}'.format(e), file=sys.stderr)
def test_page_row_specification():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j['response']['docs'] = [{'identifier': 'nasa'}]
    _j['response']['numFound'] = 1
    _search_r = json.dumps(_j)
    with IaRequestsMock() as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/advancedsearch.php'.format(PROTOCOL),
                 body=_search_r)
        rsps.add_metadata_mock('nasa')
        rsps.add(responses.POST,
                 'https://archive.org/services/search/v1/scrape',
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=False,
                 content_type='application/json; charset=UTF-8')
        r = search_items('identifier:nasa', params={'page': '1', 'rows': '1'})
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
        assert len(r.iter_as_items()) == 1
def getDaySubjectsAll(query):
    """
    Attempt to add subjects based on the most frequent n-grams in the
    items returned by the query
    """
    subjects = {}
    allsubjects = []
    identifiers = [item['identifier'] for item in search_items(query)]
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for item, itemsubjects in zip(identifiers,
                                      executor.map(getDaySubjects, identifiers)):
            subjects[item] = itemsubjects
    for key, value in subjects.items():
        allsubjects += value
    frequentsubjects = [
        subject[0]
        for subject in collections.Counter(allsubjects).most_common(100)
    ]
    print("INFO: Would discard the following frequent subjects: {}".format(
        "; ".join(frequentsubjects)))
def main():
    # usage statement
    if len(sys.argv) < 3:
        print("Append new subjects to all Internet Archive items uploaded by the current user.")
        print("Usage: %s '*****@*****.**' 'subject1;subject2;'" % (sys.argv[0]))
        sys.exit(1)
    else:
        # parameters
        uploader_email = sys.argv[1]
        add_subject = sys.argv[2]
        # get a list of all items made by the uploader
        print("Searching for all items...")
        search = search_items('uploader:%s' % uploader_email)
        # add subjects to each item in search results
        for result in search:
            append_meta(result['identifier'], add_subject)
def find_last_months_dumps_on_ia(yyyy_mm: str = yyyy_mm) -> bool:
    """
    Return True if both ol_dump_yyyy_mm and ol_cdump_yyyy_mm files
    have been saved on Internet Archive collection:ol_exports.

    >>> next_month = date.today().replace(day=1) + timedelta(days=31)
    >>> find_last_months_dumps_on_ia(f"{next_month:%Y-%m}")
    False
    """
    prefixes = {f"ol_dump_{yyyy_mm}": 0, f"ol_cdump_{yyyy_mm}": 0}
    for item in search_items("collection:ol_exports"):
        for prefix in prefixes:
            if item["identifier"].startswith(prefix):
                prefixes[prefix] += 1
                # Is there at least one item id starting with each prefix?
                if files_with_both_prefixes_found := all(prefixes.values()):
                    return files_with_both_prefixes_found
    # Exhausted the collection without matching both prefixes.
    return False
def test_page_row_specification(session):
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j['response']['items'] = [{'identifier': 'nasa'}]
    _j['response']['numFound'] = 1
    _search_r = json.dumps(_j)
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/advancedsearch.php'.format(PROTOCOL),
                 body=_search_r)
        rsps.add_metadata_mock('nasa')
        rsps.add(responses.POST,
                 '{0}//archive.org/services/search/v1/scrape'.format(PROTOCOL),
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=False,
                 content_type='application/json; charset=UTF-8')
        r = search_items('identifier:nasa',
                         params={'page': '1', 'rows': '1'},
                         archive_session=session)
        assert r.iter_as_items().search == r
        assert len(r.iter_as_items()) == 1
def query_catalog(args_from_ui):
    '''
    Given the dictionary args_from_ui, query the catalog of
    radio-aporee-maps at the Internet Archive (archive.org).

    Inputs:
        args_from_ui: dict of keywords provided from the ui

    Returns:
        query_results: Search object containing results of the query
    '''
    query = 'collection:radio-aporee-maps '
    for arg in args_from_ui:
        keyword = args_from_ui[arg]
        query = query + arg + ':' + keyword + ' '
    query_results = ia.search_items(query)
    return query_results
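# A minimal usage sketch for query_catalog; the keyword dict below is an
# illustrative assumption, not taken from the original source.
results = query_catalog({'creator': 'aporee', 'mediatype': 'audio'})
print(results.num_found)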
def find_films(cls, search_term="collection:(Feature_Films) AND mediatype:(movies)"):
    """ download films and process """
    films = search_items(search_term)
    films = [x['identifier'] for x in films]
    random_group = random.sample(films, 100)
    for t in random_group:
        print(t)
        try:
            fi = cls(t)
        except Exception:
            print("failure")
            fi = None
        if fi and fi.failed == False:
            fi.create_gifs()
def test_search_items():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j["response"]["numFound"] = 1
    _search_r = json.dumps(_j)
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/services/search/beta/scrape.php".format(protocol),
                 body=TEST_SCRAPE_RESPONSE, status=200)
        rsps.add(responses.GET,
                 "{0}//archive.org/advancedsearch.php".format(protocol),
                 body=_search_r, status=200)
        r = search_items("identifier:nasa")
        expected_results = [{"identifier": "nasa"}]
        assert r.num_found == 1
        assert iter(r).search == r
        assert len(iter(r)) == 1
        assert len(r.iter_as_results()) == 1
        assert list(r) == expected_results
        assert list(r.iter_as_results()) == expected_results
def test_page_row_specification():
    _j = json.loads(TEST_SEARCH_RESPONSE)
    _j['response']['docs'] = [{'identifier': 'nasa'}]
    _j['response']['numFound'] = 1
    _search_r = json.dumps(_j)
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/advancedsearch.php'.format(protocol),
                 body=_search_r, status=200)
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA, status=200)
        rsps.add(responses.POST,
                 'https://archive.org/services/search/beta/scrape.php',
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=False,
                 content_type='application/json; charset=UTF-8',
                 status=200)
        r = search_items('identifier:nasa', params={'page': '1', 'rows': '1'})
        assert [x.identifier for x in r.iter_as_items()] == ['nasa']
        assert r.iter_as_items().search == r
        assert len(r.iter_as_items()) == 1
def main(argv):
    args = docopt(__doc__, argv=argv)
    params = dict(p.split(':') for p in args['--parameters'])

    if args['--sort']:
        if not isinstance(args['--sort'], list):
            args['--sort'] = [args['--sort']]
        for i, field in enumerate(args['--sort']):
            key = 'sort[{0}]'.format(i)
            params[key] = field.strip().replace(':', ' ')

    fields = ['identifier'] + args['--field']
    query = ' '.join(args['<query>'])
    search_resp = search_items(query, fields=fields, params=params)

    if args['--number-found']:
        sys.stdout.write('{0}\n'.format(search_resp.num_found))
        sys.exit(0)
    for result in search_resp:
        # Write text, not bytes; sys.stdout handles encoding on Python 3.
        output = '\t'.join([result.get(f, '') for f in fields])
        sys.stdout.write(output + '\n')
def test_search_items_with_fields(session):
    _j = json.loads(TEST_SCRAPE_RESPONSE)
    _j['items'] = [{'identifier': 'nasa', 'title': 'NASA Images'}]
    search_response_str = json.dumps(_j)
    results_url = ('{0}//archive.org/services/search/v1/scrape'
                   '?q=identifier%3Anasa&count=10000&REQUIRE_AUTH=true'
                   '&fields=identifier%2Ctitle'.format(PROTOCOL))
    count_url = ('{0}//archive.org/services/search/v1/scrape'
                 '?q=identifier%3Anasa&total_only=true&REQUIRE_AUTH=true'
                 '&count=10000'.format(PROTOCOL))
    with IaRequestsMock() as rsps:
        rsps.add(responses.POST, results_url,
                 match_querystring=True,
                 body=search_response_str)
        rsps.add(responses.POST, count_url,
                 body='{"items":[],"count":0,"total":1}',
                 match_querystring=True,
                 content_type='application/json; charset=UTF-8')
        r = search_items('identifier:nasa',
                         fields=['identifier', 'title'],
                         archive_session=session)
        assert list(r) == [{'identifier': 'nasa', 'title': 'NASA Images'}]
def main(argv, session=None):
    args = docopt(__doc__, argv=argv)

    # Validate args.
    s = Schema({
        six.text_type: Use(bool),
        '<query>': Use(lambda x: ' '.join(x)),
        '--parameters': Use(lambda x: get_args_dict(x)),
        '--sort': list,
        '--field': Use(lambda x: ['identifier'] if not x and args['--itemlist'] else x),
    })
    try:
        args = s.validate(args)
    except SchemaError as exc:
        print('{0}\n{1}'.format(str(exc), printable_usage(__doc__)), file=sys.stderr)
        sys.exit(1)

    # Format sort parameters.
    for i, field in enumerate(args['--sort']):
        key = 'sort[{0}]'.format(i)
        args['--parameters'][key] = field.strip().replace(':', ' ')

    search = search_items(args['<query>'],
                          fields=args['--field'],
                          params=args['--parameters'])

    if args['--num-found']:
        print('{0}'.format(search.num_found))
        sys.exit(0)

    for result in search:
        if args['--itemlist']:
            print(result.get('identifier', ''))
        else:
            j = json.dumps(result)
            print(j)
def archive_search(qry):
    return search_items(qry, config_file=str(basedir / 'ia.ini'))
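# A minimal usage sketch, assuming basedir / 'ia.ini' points at a valid ia
# config file; the query string is an illustrative assumption.
for doc in archive_search('collection:nasa'):
    print(doc['identifier'])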
#!/usr/bin/env python
import os
import json
import ptree
from internetarchive import search_items, Item

total_bytes = 0
for result in search_items('collection:usda-nurseryandseedcatalog'):
    id = result['identifier']
    item = Item(id)
    metadata = item.get_metadata()
    item_dir = os.path.join('items', ptree.id2ptree(id).lstrip("/"))
    if not os.path.isdir(item_dir):
        os.makedirs(item_dir)
    with open(os.path.join(item_dir, 'metadata.json'), 'w') as fh:
        fh.write(json.dumps(metadata, indent=2))
    total_bytes += sum([f.size for f in item.iter_files()])
    print(item_dir)
    print(total_bytes)
import time

import processMetadata_v3
from internetarchive import search_items
from redis import Redis
from rq import Queue
# from processMetadata import ImportShow

# List every collection under etree along with its parent collection.
search = search_items('mediatype:collection AND collection:etree',
                      fields=['identifier', 'collection'])
for result in search:
    print(result['identifier'], result['collection'][0])

# Queue each collection's identifier for metadata processing.
# sstring = 'mediatype:etree AND creator:"Blue Turtle Seduction"'
sstring = 'mediatype:collection AND collection:etree'
search = search_items(sstring)
redis_conn = Redis('redis')
q = Queue(connection=redis_conn)
for result in search:
    print(result['identifier'])
    job = q.enqueue(processMetadata_v3.main, result['identifier'])
def main(argv, session):
    args = docopt(__doc__, argv=argv)

    # Validation error messages.
    destdir_msg = '--destdir must be a valid path to a directory.'

    # Validate args.
    s = Schema({
        str: Use(bool),
        '--destdir': Or([], And(Use(lambda d: d[0]), dir_exists), error=destdir_msg),
        '--format': list,
        '--glob': Use(lambda l: l[0] if l else None),
        '<file>': list,
        '--search': Or(str, None),
        '--itemlist': Or(str, None),
        '<identifier>': Or(str, None),
        '--retries': Use(lambda x: x[0]),
    })

    # Filenames should be unicode literals. Support PY2 and PY3.
    if six.PY2:
        args['<file>'] = [f.decode('utf-8') for f in args['<file>']]

    try:
        args = s.validate(args)
    except SchemaError as exc:
        sys.stderr.write('{0}\n{1}\n'.format(str(exc), printable_usage(__doc__)))
        sys.exit(1)

    retries = int(args['--retries'])

    if args['--itemlist']:
        ids = [x.strip() for x in open(args['--itemlist'])]
        total_ids = len(ids)
    elif args['--search']:
        _search = search_items(args['--search'])
        total_ids = _search.num_found
        ids = search_ids(args['--search'])

    # Download specific files.
    if args['<identifier>']:
        if '/' in args['<identifier>']:
            identifier = args['<identifier>'].split('/')[0]
            files = ['/'.join(args['<identifier>'].split('/')[1:])]
        else:
            identifier = args['<identifier>']
            files = args['<file>']
        total_ids = 1
        ids = [identifier]
    else:
        files = None

    errors = list()
    for i, identifier in enumerate(ids):
        if total_ids > 1:
            item_index = '{0}/{1}'.format((i + 1), total_ids)
        else:
            item_index = None

        try:
            item = session.get_item(identifier)
        except Exception as exc:
            print('{0}: failed to retrieve item metadata - errors'.format(identifier))
            continue

        # Otherwise, download the entire item.
        _errors = item.download(
            files=files,
            formats=args['--format'],
            glob_pattern=args['--glob'],
            dry_run=args['--dry-run'],
            verbose=args['--verbose'],
            silent=args['--silent'],
            ignore_existing=args['--ignore-existing'],
            checksum=args['--checksum'],
            destdir=args['--destdir'],
            no_directory=args['--no-directories'],
            retries=retries,
            item_index=item_index,
            ignore_errors=True,
        )
        if _errors:
            errors.append(_errors)

    if errors:
        # TODO: add option for a summary/report.
        sys.exit(1)
    else:
        sys.exit(0)