def extract_data(records, feed_stem, collection_title):
    """Extract IA data from API using provided IDs, and compose results into
    list of records that can be used to generate OPDS.

    See https://archive.org/services/docs/api/internetarchive/

    Args:
        records (list): list of dicts of form:
            {'bibid': <bibid>, 'id': <ia_id>, 'label': <link_label>}
        feed_stem (str): abbreviated label of feed, e.g., 'ia_mrp_feed'
        collection_title (str): human-readable string, e.g.:
            "Missionary Research Pamphlets"

    Returns:
        dict: form of {'data': <the_output>, 'errors': <the_errors>}
    """
    the_output = []
    the_errors = []

    # Check for duplicate ids and report them (they will be processed anyway).
    the_ids = [r['id'] for r in records]
    dupe_ids = find_duplicates(the_ids)
    dupe_errors = [[
        str(datetime.today()), feed_stem, r['bibid'], r['id'], 'Duplicate ID'
    ] for r in records if r['id'] in dupe_ids]
    # pprint(dupe_errors)
    the_errors += dupe_errors

    for record in records:
        record_files = get_item(record['id']).files
        record_metadata = get_item(record['id']).metadata

        if not record_metadata:
            # There was no data from the API.
            print('ERROR: No data for ' + record['bibid'] + ' : '
                  + record['id'] + '! Skipping...')
            the_errors.append([
                str(datetime.today()), feed_stem, record['bibid'],
                record['id'], 'No data!'
            ])
            continue

        if all('.pdf' not in f['name'] for f in record_files):
            # There is data but no PDF derivative to use.
            print('ERROR: No PDF available for ' + record['bibid'] + ' : '
                  + record['id'] + '! Skipping...')
            the_errors.append([
                str(datetime.today()), feed_stem, record['bibid'],
                record['id'], 'No PDF file!'
            ])
            continue

        # Add the metadata to the output.
        print(record_metadata['identifier'] + ': ' + record_metadata['title'])

        # Add CUL-specific metadata for use in generating feed XML.
        record_metadata['cul_metadata'] = {
            'bibid': record['bibid'],
            'feed_id': feed_stem,
            'collection_name': collection_title,
            'label': record['label']
        }
        the_output.append(record_metadata)

    return {'data': the_output, 'errors': the_errors}
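# A minimal usage sketch for extract_data() above. The bibid, identifier, feed
# name, and collection title here are hypothetical; find_duplicates() and the
# other imports are assumed to be available as in the function itself.
records = [
    {'bibid': '123456', 'id': 'examplepamphlet00smit', 'label': 'Example pamphlet'},
]
result = extract_data(records, 'ia_example_feed', 'Example Collection')
print(len(result['data']), 'records extracted,', len(result['errors']), 'errors')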
def test_get_item_with_kwargs():
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        item = get_item("nasa", http_adapter_kwargs={"max_retries": 13})
        assert isinstance(
            item.session.adapters["{0}//".format(protocol)].max_retries,
            urllib3.Retry)

        try:
            item = get_item("nasa", request_kwargs={"timeout": 0.0000000000001})
        except Exception as exc:
            assert "Connection to archive.org timed out" in str(exc)
def test_get_item_with_kwargs():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        item = get_item('nasa', http_adapter_kwargs={'max_retries': 13})
        assert isinstance(
            item.session.adapters['{0}//'.format(PROTOCOL)].max_retries,
            urllib3.Retry)

        try:
            get_item('nasa', request_kwargs={'timeout': .0000000000001})
        except Exception as exc:
            assert 'timed out' in str(exc)
def get_valid_identifier_suffix(library, Id):
    item = get_item("%s_%s_%s_1" % ('bub', library, Id))
    if item.exists == False:
        item = get_item("%s_%s_%s" % ('bub', library, Id))
        if item.exists == False:
            return Id
    for index in range(2, 10):
        item = get_item("%s_%s_%s_%s" % ('bub', library, Id, index))
        if item.exists == False:
            return Id + "_" + str(index)
    item = get_item(urandom(16).encode("hex"))
    return item
def get_valid_identifier_suffix(library, Id):
    item = get_item("%s_%s_%s_1" % ("bub", library, Id))
    if item.exists == False:
        item = get_item("%s_%s_%s" % ("bub", library, Id))
        if item.exists == False:
            return Id
    for index in range(2, 10):
        item = get_item("%s_%s_%s_%s" % ("bub", library, Id, index))
        if item.exists == False:
            return Id + "_" + str(index)
    item = get_item(urandom(16).encode("hex"))
    return item
def get_valid_identifier(self, primary=True):
    """Iterate over identifiers suffixed by _<no>, until found."""
    item = ia.get_item("%s_%s_%s" % ('bub', self.library, self.Id))
    if item.exists == False and primary == True:
        return item
    for index in range(2, 10):
        item = ia.get_item("%s_%s_%s_%s" % ('bub', self.library, self.Id, index))
        if item.identifier == self.ia_identifier:
            continue
        if item.exists == False:
            return item
    item = ia.get_item(urandom(16).encode("hex"))
    return item
def test_get_item_with_kwargs():
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        item = get_item('nasa', http_adapter_kwargs={'max_retries': 13})
        assert isinstance(
            item.session.adapters['{0}//'.format(protocol)].max_retries,
            urllib3.Retry)
        try:
            item = get_item('nasa', request_kwargs={'timeout': .0000000000001})
        except Exception as exc:
            assert 'Connection to archive.org timed out' in str(exc)
def check_for_new_items(username, password, collection, collections_db):
    """username->(String) IA username
    password->(String) IA password
    collection->(String) IA identifier for the collection to watch
    collections_db->(String) file path of plain text database of collections
        to ignore.
            ident_1
            ident_d
            ident_3
    returns->(list) list of identifiers of new items in collection

    Checks if there is a new group of scans in the collection from a list
    in a text file"""
    configure(username, password)  # Configure log in information for IA
    downloaded_collections = []
    with open(collections_db) as f:
        for line in f:
            downloaded_collections.append(line.rstrip("\n"))
    new_collections = []
    for book in get_item(collection).contents():
        if (book.identifier not in downloaded_collections):
            new_collections.append(book.identifier)
    return new_collections
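# A minimal usage sketch for check_for_new_items() above, with hypothetical
# credentials, collection identifier, and ignore-file path. The ignore file is
# plain text with one already-downloaded identifier per line, e.g.:
#   examplescan001
#   examplescan002
new_ids = check_for_new_items('ia-user@example.org', 'example-password',
                              'examplecollection', 'downloaded_items.txt')
print('New items:', new_ids)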
def main():
    search = internetarchive.search.Search(
        'urlteam terroroftinytown -collection:test_collection')

    for result in search:
        print(result)
        item = internetarchive.get_item(result['identifier'])

        if not item.metadata['subject'] == 'urlteam;terroroftinytown':
            continue

        subjects = ['urlteam', 'terroroftinytown', 'archiveteam']

        for file_obj in item.iter_files():
            if file_obj.name.endswith('.zip'):
                shortener_id = file_obj.name.split('.', 1)[0]
                subjects.append(shortener_id)

        new_subject = ';'.join(subjects)
        print(new_subject)

        item.modify_metadata({'subject': new_subject},
                             access_key=ACCESS_KEY, secret_key=SECRET_KEY)
def validate():
    """Check all synced files to make sure their md5 hash matches the hash
    stored on Archive.org"""
    site_id = None
    args = get_args()
    if 'site' in args and args.site:
        site_id = args.site

    synced_session = ytarchive().sessionsGetSyncedOldest(site_id)
    if synced_session:
        session_log = ytarchive().logsGetSynced(synced_session.id)
        validation_time = time.time() - (60 * 60)
        if session_log and session_log.time < validation_time:
            archive_info = get_item(synced_session.archive_id)
            session_files = ytarchive().filesGetSynced(synced_session.id)
            valid = validate_files(session_files, archive_info)
            if valid:
                ytarchive().sessionsUpdate({
                    'id': synced_session.id,
                    'validated': True
                })
                log(synced_session, "Session files validated", c.SESSION_SYNCED)
                cleanup_files(synced_session)
            else:
                ytarchive().sessionsUpdate({
                    'id': synced_session.id,
                    'state': c.SESSION_FAILED,
                    'validated': False
                })
                log(synced_session, "Session files failed validation",
                    c.SESSION_FAILED, c.LOG_ERROR)
def item_summary(item_id):
    print("summarizing %s" % item_id)

    # IA's API can throw errors, so try 10 times before failing.
    tries = 0
    while tries < 10:
        try:
            item = ia.get_item(item_id)
            break
        except Exception as e:
            print('caught exception: %s' % e)
            time.sleep(10)
            tries += 1

    size = 0
    for file in item.item_metadata.get('files', []):
        if file['name'].endswith('arc.gz'):
            size += int(file['size'])

    # Check for missing metadata before reading the identifier out of it.
    if 'metadata' not in item.item_metadata:
        print('missing metadata %s' % item_id)
        return None, None

    m = re.match(r'^.+-(\d\d\d\d)(\d\d)(\d\d)',
                 item.item_metadata['metadata']['identifier'])
    date = '%s-%s-%s' % m.groups()

    return date, size
def uploadTikTok(username, tiktok, deletionStatus, file):
    regex = re.compile('[0-9]{17}')
    regexA = re.compile('[0-9]{18}')
    regexB = re.compile('[0-9]{19}')
    regexC = re.compile('[0-9]{8}')
    regexD = re.compile('[0-9]{9}')
    if (os.path.isdir(tiktok)
            and (regex.match(str(tiktok)) or regexA.match(str(tiktok))
                 or regexB.match(str(tiktok)) or regexC.match(str(tiktok))
                 or regexD.match(str(tiktok)))):
        item = get_item('tiktok-' + tiktok)
        item.upload('./' + tiktok + '/',
                    verbose=True,
                    checksum=True,
                    delete=deletionStatus,
                    metadata=dict(collection='opensource_media',
                                  subject='tiktok',
                                  creator=username,
                                  title='TikTok Video by ' + username,
                                  originalurl='https://www.tiktok.com/@'
                                  + username + '/video/' + tiktok,
                                  scanner='TikUp 2020.07.01'),
                    retries=9001,
                    retries_sleep=60)
        if (deletionStatus == True):
            os.rmdir(tiktok)
        print()
        print('Uploaded to https://archive.org/details/tiktok-' + tiktok)
        print()
        if file != None:
            file.write(str(tiktok))
            file.write('\n')
def sequence(self, book):
    """
    :param [NGramProcessor] pipeline: a list of NGramProcessors that run modules
    :param [str|ia.Item] book: an Archive.org book Item or Item.identifier
    :param int rows: limit how many results returned
    :param int page: starting page to offset search results
    """
    try:
        sequence_tic = time.perf_counter()
        sq = self.Sequence(copy.deepcopy(self.pipeline))
        sq.book = book if type(book) is ia.Item else ia.get_item(book)
        if sq.book.exists:
            for p in sq.pipeline:
                sq.pipeline[p].run(sq.book)
            sequence_toc = time.perf_counter()
            sq.total_time = round(sequence_toc - sequence_tic, 3)
            return sq
        else:
            print(sq.book.identifier + ' - Item cannot be found.')
            logging.error(sq.book.identifier + ' - Item cannot be found.')
    except IndexError:
        print(sq.book.identifier
              + ' - does not have DjvuXML and/or DjvuTXT to be sequenced!')
        logging.error(sq.book.identifier
                      + ' - does not have DjvuXML and/or DjvuTXT to be sequenced!')
    except requests.exceptions.HTTPError:
        print(sq.book.identifier
              + ' - DjvuXML and/or DjvuTXT is forbidden and can\'t be sequenced!')
        logging.error(sq.book.identifier
                      + ' - DjvuXML and/or DjvuTXT is forbidden and can\'t be sequenced!')
def upload_ia_item(self):
    logger.debug("Uploading IA item for {}".format(self.ia_id))
    if not self.has_image and not self.has_crop:
        logger.debug("No images to upload")
        return None
    files = []
    if self.has_image:
        saved_image = self.save_image()
        files.append(saved_image)
    if self.has_crop:
        saved_crop = self.save_crop()
        files.append(saved_crop)
    internetarchive.upload(
        self.ia_id,
        files,
        metadata=self.ia_metadata,
        access_key=settings.IA_ACCESS_KEY_ID,
        secret_key=settings.IA_SECRET_ACCESS_KEY,
        checksum=False,
        verbose=True
    )
    if self.has_image:
        os.remove(saved_image)
    if self.has_crop:
        os.remove(saved_crop)
    return internetarchive.get_item(self.ia_id)
def check_for_new_items(username,password,collection,collections_db):
    """username->(String) IA username
    password->(String) IA password
    collection->(String) IA identifier for the collection to watch
    collections_db->(String) file path of plain text database of collections
        to ignore.
            ident_1
            ident_d
            ident_3
    returns->(list) list of identifiers of new items in collection

    Checks if there is a new group of scans in the collection from a list
    in a text file"""
    configure(username,password)  # Configure log in information for IA
    downloaded_collections = []
    with open(collections_db) as f:
        for line in f:
            downloaded_collections.append(line.rstrip("\n"))
    new_collections = []
    for book in get_item(collection).contents():
        if(book.identifier not in downloaded_collections):
            new_collections.append(book.identifier)
    return new_collections
def upload_item(item_dir):
    all_files = ['{0}/{1}'.format(item_dir, x) for x in os.listdir(item_dir)]

    # Make sure the item has at the very least a PDF and metadata.
    required_files = [
        '{0}/{0}.pdf'.format(item_dir),
        '{0}/{0}.json'.format(item_dir)
    ]
    for required_file in required_files:
        assert any(f == required_file for f in all_files)

    # Parse metadata.
    json_md = '{0}/{0}.json'.format(item_dir)
    with open(json_md) as fp:
        md = json.load(fp)
    assert 'collection' in md

    # We don't want to upload the JSON file, remove it from all_files.
    files = [x for x in all_files if x != '{0}/{0}.json'.format(item_dir)]

    item = get_item(item_dir)
    rs = item.upload(files, metadata=md, retries=100, delete=True, checksum=True)
    if all(r.status_code == 200 for r in rs):
        with open('uploaded', 'a'):
            os.utime('uploaded', None)
    return rs
def main():
    parser = argparse.ArgumentParser(
        description='Correct the title of the entries of a given set of rounds.')
    parser.add_argument('rounds', metavar='ROUND', nargs='+',
                        help='Round to correct')
    parser.add_argument('--metadata-file', default='metadata_rnd_1_to_89.csv',
                        help='Path of the metadata file')
    args = parser.parse_args()

    # Load metadata
    reader = csv.DictReader(open(args.metadata_file))

    # Iterate over entries of those rounds
    for d in reader:
        if d['round'] in args.rounds:
            place = d['place']
            author = d['author']
            padded_round = "{0:03d}".format(int(d['round']))
            title = d['title']

            # Don't try to correct empty entries
            if author == 'SDCTester':
                continue

            root_target_file = 'SDC' + padded_round + '-' \
                + padded_place(place) + '_' + author + '_-_' \
                + title.replace(' ', '_')
            root_target_file_decoded = root_target_file.decode('utf8')
            target_file = 'files/' + root_target_file_decoded + ".flac"
            new_title = place + ' - ' + root_target_file_decoded
            md = {'title': new_title}
            item = get_item('SDCompo_Round_' + padded_round)
            print "Round {} {}:".format(d['round'], author), title, "->", new_title
            item.modify_metadata(md, target=target_file)
def upload_to_internet_archive(self, link_guid):
    # setup
    asset = Asset.objects.get(link_id=link_guid)
    link = asset.link
    identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link_guid
    warc_path = os.path.join(asset.base_storage_path, asset.warc_capture)

    # create IA item for this capture
    item = internetarchive.get_item(identifier)
    metadata = {
        'collection': settings.INTERNET_ARCHIVE_COLLECTION,
        'mediatype': 'web',
        'date': link.creation_timestamp,
        'title': 'Perma Capture %s' % link_guid,
        'creator': 'Perma.cc',

        # custom metadata
        'submitted_url': link.submitted_url,
        'perma_url': "http://%s/%s" % (settings.HOST, link_guid)
    }

    # upload
    with default_storage.open(warc_path, 'rb') as warc_file:
        success = item.upload(warc_file,
                              metadata=metadata,
                              access_key=settings.INTERNET_ARCHIVE_ACCESS_KEY,
                              secret_key=settings.INTERNET_ARCHIVE_SECRET_KEY,
                              verbose=True,
                              debug=True)
        if success:
            print "Succeeded."
        else:
            print "Failed."
            self.retry(exc=Exception("Internet Archive reported upload failure."))
def test_get_item_with_config():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        item = get_item('nasa', config={'s3': {'access': 'key'}})
        assert item.session.access_key == 'key'
def test_get_item():
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        item = get_item('nasa')
        assert item.identifier == 'nasa'
def main(argv):
    args = docopt(__doc__, argv=argv)
    verbose = args['--verbose']
    item = get_item(args['<identifier>'])

    # Files that cannot be deleted via S3.
    no_delete = ['_meta.xml', '_files.xml', '_meta.sqlite']

    if verbose:
        sys.stdout.write('Deleting files from {0}\n'.format(item.identifier))

    if args['--all']:
        files = [f for f in item.iter_files()]
        args['--cascade'] = True
    else:
        files = [item.get_file(f) for f in args['<file>']]

    for f in files:
        if not f:
            if verbose:
                sys.stderr.write(' error: "{0}" does not exist\n'.format(f.name))
            sys.exit(1)
        if any(f.name.endswith(s) for s in no_delete):
            continue
        resp = f.delete(verbose=args['--verbose'],
                        cascade_delete=args['--cascade'])
        if resp.status_code != 204:
            error = parseString(resp.content)
            msg = get_xml_text(error.getElementsByTagName('Message'))
            sys.stderr.write(' error: {0} ({1})\n'.format(msg, resp.status_code))
            sys.exit(1)
def __init__(self, ident):
    """ set up directory info - download from archive if necessary """
    self.ident = ident
    self.dir = os.path.join(Film.root_dir, ident)
    self.clip_dir = os.path.join(self.dir, "clips")
    self.failed = False
    if os.path.isdir(self.dir) == False:
        item = get_item(ident)
        ogg = [x['name'] for x in item.files if ".ogv" in x['name']]
        meta = [x['name'] for x in item.files if "_meta.xml" in x['name']]
        if ogg and meta:
            ogg = ogg[:1]
            meta = meta[:1]
            os.makedirs(self.dir)
            os.makedirs(self.clip_dir)
            download(ident, files=ogg + meta, destdir=Film.root_dir,
                     verbose=True)
        else:
            self.failed = True
    if self.failed == False:
        self.ogv = [x for x in os.listdir(self.dir) if ".ogv" in x]
        self.meta = [x for x in os.listdir(self.dir) if "_meta.xml" in x]
        if self.ogv and self.meta:
            self.ogv = self.ogv[0]
            self.meta = self.meta[0]
            self.load_meta()
        else:
            self.failed = True
def append_meta(identifier, add_subject):
    # obtain existing metadata for given item
    item = get_item(identifier)
    subject = item.metadata['subject']

    # if subjects are given as a list, convert to semicolon-separated list
    if isinstance(subject, list):
        l = ""
        for element in subject:
            l += "%s;" % element
        subject = l

    # append new subject to existing subject
    if str(subject).endswith(';'):
        new_subject = str(subject) + add_subject
    else:
        new_subject = str(subject) + ';' + add_subject

    # upload new metadata
    r = item.modify_metadata(dict(subject=new_subject))

    # check if metadata successfully modified
    if (r.status_code == 200):
        print(":: [Identifier] Item: [%s] %s" % (identifier, item.metadata['title']))
        print("Subjects '%s' successfully appended." % add_subject)
        print("Result: %s" % new_subject)
    else:
        print("Failed to add new subjects.")
def mk_mirror(target):
    '''Make the mirror'''
    session = ArchiveSession()
    target = 'collection:' + target
    print("Attempting to download collection: " + target)
    search = ia.Search(session, target)

    ## Because the internetarchive module won't return us a list
    ## we'll have to make our own.
    current_item = 1
    total_item = 0
    collection = []
    for entry in search:
        collection.append(entry)
        total_item += 1

    ## Go through all items of the collection and download
    for entry in collection:
        item_id = entry['identifier']
        print('Downloading ' + str(current_item) + '/' + str(total_item) + '\t'
              + item_id)
        item = get_item(item_id)
        status = item.download()
        print('\t\t Download successful')
        current_item += 1
def getFileList(self):
    """
    This function is used to get the list of files in an item and excludes
    the default files that are present in all Internet Archive items.

    Returns:
        List of files in the item excluding default files in alphabetical
        order. False if an error has occurred.
    """
    tries = 0
    while tries < self.retries:
        try:
            iaitem = internetarchive.get_item(identifier=self.identifier)
            break
        except Exception as exception:
            self.handleException(exception=exception)
            if tries == self.retries:
                return False
            else:
                tries += 1
                time.sleep(60 * tries)
    filelist = []
    for thefile in iaitem.files:
        filename = thefile['name']
        if filename in self.defaultFiles:
            continue
        else:
            filelist.append(filename)
    return sorted(filelist)
def download_item(identifier):
    '''
    Download the mp3 file associated with identifier from the catalog.

    Inputs:
        identifier: str, identifier for an Item object

    Returns:
    '''
    item = ia.get_item(identifier)
    f_name = ''
    # This loop finds the .mp3 associated with the audio file.
    for f in item.iter_files():
        if f.name[-4:] == '.mp3':
            f_name = f.name
            break
    assert f_name != '', 'No .mp3 file associated with item {}.\
        Try a different item'.format(identifier)
    f = item.get_file(f_name)
    if f.size <= MAX_SIZE:
        f.download(SOUND_DIR + f_name)
        return f.name
    else:
        print('File size is', f.size, 'bytes')
        print('File size exceeds', MAX_SIZE, 'bytes')
        return None
def __init__(self, archive_id, metadata=None, config_file_path=None,
             repo_base=None):
    """
    :param archive_id:
    :param config_file_path:
    :param repo_base: In archive item, place each file in a folder mirroring
        its local location.
    """
    self.repo_base = repo_base
    self.archive_id = archive_id
    self.archive_session = internetarchive.get_session(
        config_file=config_file_path)
    self.archive_item = internetarchive.get_item(
        archive_id, config_file=config_file_path)
    self.metadata = metadata
    logging.info(self.archive_item.identifier)
    self.original_item_files = list(
        filter(
            lambda x: x["source"] == "original"
            and not x["name"].startswith(self.archive_item.identifier)
            and not x["name"].startswith("_"),
            self.archive_item.files))
    self.original_item_file_names = sorted(
        map(lambda x: x["name"], self.original_item_files))
def _upload_files(args, identifier, local_file, upload_kwargs):
    verbose = True if args['--quiet'] is False else False
    config = {} if not args['--log'] else {'logging': {'level': 'INFO'}}
    item = get_item(identifier, config=config)
    if verbose:
        sys.stdout.write('{0}:\n'.format(item.identifier))

    response = item.upload(local_file, **upload_kwargs)

    if args['--debug']:
        for i, r in enumerate(response):
            if i != 0:
                sys.stdout.write('---\n')
            headers = '\n'.join(
                [' {0}: {1}'.format(k, v) for (k, v) in r.headers.items()]
            )
            sys.stdout.write('Endpoint:\n {0}\n\n'.format(r.url))
            sys.stdout.write('HTTP Headers:\n{0}\n'.format(headers))
    else:
        for resp in response:
            if not resp:
                continue
            if resp.status_code == 200:
                continue
            error = parseString(resp.content)
            code = get_xml_text(error.getElementsByTagName('Code'))
            msg = get_xml_text(error.getElementsByTagName('Message'))
            sys.stderr.write(
                'error "{0}" ({1}): {2}\n'.format(code, resp.status_code, msg)
            )
            sys.exit(1)
def upload_single_show_to_internetarchive(show_info: RefinedShow):
    show_title = f"Hooting Yard On The Air: {show_info.title()}"
    upload_id = f"{IA_PRFX}_{show_info.id}"
    log.info(f"Attempting to upload {show_info.id}, Title: {show_title}")
    show_text = show_info.get_title_and_text()
    show_toc = show_info.get_toc()
    md = {
        "collection": "hooting-yard",
        "description": show_toc,
        "mediatype": "audio",
        "title": show_title,
        "creator": "Frank Key",
        "date": show_info.tx_date().isoformat(),
        "notes": show_text,
    }
    log.info(f"Metadata: {pprint.pformat(md)}")
    try:
        item: Item = get_item(upload_id)
        log.info(f"Found an item: {item}")
        item.modify_metadata(metadata=md)
    except internetarchive.exceptions.ItemLocateError:
        r = upload(
            identifier=upload_id,
            files=[show_info.get_audio_file().path],
            metadata=md,
            verbose=True,
        )
        assert r[0].status_code == 200
    log.info(f"Completed upload {show_info.id}")
    return upload_id
def main():
    search = internetarchive.search.Search('urlteam terroroftinytown -collection:test_collection')

    for result in search:
        print(result)
        item = internetarchive.get_item(result['identifier'])

        if not item.metadata['subject'] == 'urlteam;terroroftinytown':
            continue

        subjects = ['urlteam', 'terroroftinytown', 'archiveteam']

        for file_obj in item.iter_files():
            if file_obj.name.endswith('.zip'):
                shortener_id = file_obj.name.split('.', 1)[0]
                subjects.append(shortener_id)

        new_subject = ';'.join(subjects)
        print(new_subject)

        item.modify_metadata(
            {'subject': new_subject},
            access_key=ACCESS_KEY,
            secret_key=SECRET_KEY
        )
def main(argv):
    args = docopt(__doc__, argv=argv)

    item = get_item(args['<identifier>'])

    # Check existence of item.
    if args['--exists']:
        if item.exists:
            sys.stdout.write('{0} exists\n'.format(item.identifier))
            sys.exit(0)
        else:
            sys.stderr.write('{0} does not exist\n'.format(item.identifier))
            sys.exit(1)

    # Modify metadata.
    elif args['--modify'] or args['--append']:
        append = True if args['--append'] else False
        metadata_args = args['--modify'] if args['--modify'] else args['--append']
        metadata = get_args_dict(metadata_args)
        response = modify_metadata(args['<identifier>'], metadata, append=append)
        if not response.json()['success']:
            error_msg = response.json()['error']
            sys.stderr.write('error: {0} ({1})\n'.format(error_msg,
                                                         response.status_code))
            sys.exit(1)
        sys.stdout.write('success: {0}\n'.format(response.json()['log']))

    # Get metadata.
    elif args['--formats']:
        formats = set([f.format for f in item.iter_files()])
        sys.stdout.write('\n'.join(formats) + '\n')
    else:
        metadata = dumps(item.metadata)
        sys.stdout.write(metadata + '\n')

    sys.exit(0)
def test_upload(self):
    s = self.start_ia_session()
    item = get_item('opencontext-test-item', archive_session=s, debug=True)
    r = item.upload(
        'https://artiraq.org/static/opencontext/abydos-looting/full/fig001.jpg'
    )
    return r
def get_ia_item(self, identifier):
    try:
        item = get_item(identifier, archive_session=self.session)
    except Exception as e:
        self.logger.warn('Could not get item %s. Error %s', identifier, e)
        item = None
    return item
def validate(self):
    """Validate the form."""
    initial_validation = super().validate()

    if not initial_validation:
        return False

    url, track_id = canonify_track_url(self.url.data)

    try:
        item = get_item(track_id, request_kwargs={'timeout': 30})
        metadata = item.item_metadata.get('metadata')
        if not metadata:
            raise ValueError("'%s' not found." % track_id)
    except Exception as exc:
        self.add_form_error(
            "Could not get meta data from Archive.org: %s" % exc)
    else:
        if metadata.get(
                'title', '').strip().lower() != self.title.data.strip().lower():
            self.add_form_error(
                "Title does not match title in Archive.org meta data.")

        if metadata.get(
                'creator', '').strip().lower() != self.artist.data.strip().lower():
            self.add_form_error("Artist does not match creator / author "
                                "in Archive.org meta data.")

        flac = None
        for file in getattr(item, 'files', []):
            if file.get('format') == 'Flac':
                flac = file
                break
        else:
            self.add_form_error("Track not available in FLAC format.")

        if flac:
            try:
                length = float(flac.get('length', 0))
            except (TypeError, ValueError):
                length = 0

            min_length = current_app.config.get('MIN_TRACK_LENGTH', 60.0)
            max_length = current_app.config.get('MAX_TRACK_LENGTH', 300.0)

            if length < min_length:
                self.add_form_error(
                    "Track does not have minimum required duration (%s min.)."
                    % format_duration(min_length))
            elif length > max_length:
                self.add_form_error(
                    "Track exceeds maximum allowed duration (%s min.)."
                    % format_duration(max_length))
        else:
            self.add_form_error(
                "Missing meta data for FLAC download of track.")

    return not self.errors.get('form')
def backup_report(ig, year, report_id, options=None):
    if options is None:
        options = {}
    logging.warn("")

    # this had better be there
    report = json.load(open(metadata_path(ig, year, report_id)))

    if report.get("unreleased"):
        logging.warn("[%s][%s][%s] Unreleased report, skipping." % (ig, year, report_id))
        return True

    if already_uploaded(ig, year, report_id) and (options.get("force") is not True):
        logging.warn("[%s][%s][%s] Already backed up, skipping." % (ig, year, report_id))
        return True

    logging.warn("[%s][%s][%s] Initializing item." % (ig, year, report_id))

    item_id = item_id_for(ig, year, report_id)
    item = internetarchive.get_item(item_id)

    if item.exists and (options.get("force") is not True):
        logging.warn("[%s][%s][%s] Ooooops, item does exist. Marking as done, and stopping." % (ig, year, report_id))
        mark_as_uploaded(ig, year, report_id)
        return True

    metadata = collection_metadata()
    metadata.update(item_metadata(report))

    # 1) add the metadata file, and attach the IA item metadata to it
    logging.warn("[%s][%s][%s] Sending metadata!" % (ig, year, report_id))

    success = upload_files(item, metadata_path(ig, year, report_id), metadata, options)
    if not success:
        logging.warn("[%s][%s][%s] :( Error sending metadata." % (ig, year, report_id))
        return False

    # 2) Unless --meta is on, upload the associated report files.
    if not options.get("meta"):
        report_path = file_path(ig, year, report_id, report["file_type"])
        text_path = file_path(ig, year, report_id, "txt")

        to_upload = []
        if os.path.exists(report_path):
            to_upload.append(report_path)
        if (report_path != text_path) and os.path.exists(text_path):
            to_upload.append(text_path)

        if len(to_upload) > 0:
            logging.warn("[%s][%s][%s] Sending %i report files!" % (ig, year, report_id, len(to_upload)))
            success = upload_files(item, to_upload, None, options)
            if not success:
                logging.warn("[%s][%s][%s] :( Error uploading report itself." % (ig, year, report_id))
                return False

    logging.warn("[%s][%s][%s] :) Uploaded:\n%s" % (ig, year, report_id, ia_url_for(item_id)))
    mark_as_uploaded(ig, year, report_id)
    return True
def test_get_item_with_config_file(tmpdir, nasa_mocker):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open('ia_test.ini', 'w') as fh:
        fh.write(test_conf)

    item = get_item('nasa', config_file='ia_test.ini')
    assert item.session.access_key == 'key2'
def upload_to_ia():
    item = get_item(WARC_FILE)
    md = dict(mediatype='warc', creator='PhantomWARC')
    cdxmd = dict(mediatype='cdx', creator='PhantomWARC')
    item.upload(WARC_NAME, metadata=md,
                access_key=os.environ['IAS3_ACCESS_KEY'],
                secret_key=os.environ['IAS3_SECRET_KEY'])
    item.upload(CDX_NAME, metadata=cdxmd,
                access_key=os.environ['IAS3_ACCESS_KEY'],
                secret_key=os.environ['IAS3_SECRET_KEY'])
    IAURL = "https://archive.org/details/%s" % WARC_FILE
    print "WARC and CDX files uploaded to the Internet Archive as %s" % IAURL
def test_internet_archive():
    from datetime import timedelta
    from django.utils import timezone
    import internetarchive
    from perma.models import Link
    from django.template.defaultfilters import truncatechars

    start_date = timezone.now() - timedelta(days=3)
    end_date = timezone.now() - timedelta(days=2)

    links = Link.objects.filter(
        internet_archive_upload_status="completed",
        creation_timestamp__range=(start_date, end_date)
    )

    guid_results = dict()
    all_results = dict()

    c = {"s3": {"access": settings.INTERNET_ARCHIVE_ACCESS_KEY,
                "secret": settings.INTERNET_ARCHIVE_SECRET_KEY}}
    internetarchive.get_session(config=c)

    for link in links:
        identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link.guid
        item = internetarchive.get_item(identifier)
        warc_name = "%s.warc.gz" % link.guid

        try:
            fnames = [f.name for f in internetarchive.get_files(identifier, glob_pattern="*gz")]
            guid_results["uploaded_file"] = warc_name in fnames
            if settings.INTERNET_ARCHIVE_COLLECTION == "test_collection":
                guid_results["collection"] = item.metadata["collection"] == settings.INTERNET_ARCHIVE_COLLECTION
            else:
                guid_results["collection"] = item.metadata["collection"][0] == settings.INTERNET_ARCHIVE_COLLECTION
            guid_results["title"] = item.metadata["title"] == "%s: %s" % (
                link.guid, truncatechars(link.submitted_title, 50),
            )
            guid_results["mediatype"] = item.metadata["mediatype"] == "web"
            guid_results["description"] = item.metadata["description"] == "Perma.cc archive of %s created on %s." % (
                link.submitted_url, link.creation_timestamp,
            )
            guid_results["contributor"] = item.metadata["contributor"] == "Perma.cc"
            guid_results["submitted_url"] = item.metadata["submitted_url"] == link.submitted_url
            guid_results["perma_url"] = item.metadata["perma_url"] == "http://%s/%s" % (settings.HOST, link.guid)
            guid_results["external-identifier"] = item.metadata["external-identifier"] == "urn:X-perma:%s" % link.guid
            if link.organization:
                guid_results["organization"] = item.metadata["sponsor"] == "%s - %s" % (
                    link.organization, link.organization.registrar,
                )
        except Exception as e:
            guid_results["error"] = e
            pass

        all_results[link.guid] = guid_results

    print all_results
def main(argv):
    args = docopt(__doc__, argv=argv)

    # Download specific files.
    if '/' in args['<identifier>']:
        identifier = args['<identifier>'].split('/')[0]
        # Everything after the identifier is a file path within the item.
        files = args['<identifier>'].split('/')[1:]
    else:
        identifier = args['<identifier>']
        files = args['<file>']

    item = get_item(identifier)

    if (args['--quiet'] is False) and (args['--dry-run'] is False):
        verbose = True
    else:
        verbose = False

    if files:
        if verbose:
            sys.stdout.write('{0}:\n'.format(identifier))
        for f in files:
            fname = f.encode('utf-8')
            if args['--no-directories']:
                path = fname
            else:
                path = os.path.join(identifier, fname)
            f = item.get_file(fname)
            if not f:
                sys.stderr.write(' {} doesn\'t exist!\n'.format(fname))
                continue
            if args['--dry-run']:
                sys.stdout.write(f.url + '\n')
            else:
                f.download(path, verbose, args['--ignore-existing'],
                           args['--checksum'], args['--destdir'])
        sys.exit(0)

    # Otherwise, download the entire item.
    if args['--source']:
        ia_source = args['--source']
    elif args['--original']:
        ia_source = ['original']
    else:
        ia_source = None

    item.download(
        concurrent=args['--concurrent'],
        source=ia_source,
        formats=args['--format'],
        glob_pattern=args['--glob'],
        dry_run=args['--dry-run'],
        verbose=verbose,
        ignore_existing=args['--ignore-existing'],
        checksum=args['--checksum'],
        destdir=args['--destdir'],
        no_directory=args['--no-directories'],
    )
    sys.exit(0)
def test_get_item_with_config_file(tmpdir):
    tmpdir.chdir()
    test_conf = """[s3]\naccess = key2"""
    with open("ia_test.ini", "w") as fh:
        fh.write(test_conf)

    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body=ITEM_METADATA,
                 status=200)
        item = get_item("nasa", config_file="ia_test.ini")
        assert item.session.access_key == "key2"
def _upload_files(args, identifier, local_file, upload_kwargs, prev_identifier=None):
    verbose = True if args['--quiet'] is False else False
    config = {} if not args['--log'] else {'logging': {'level': 'INFO'}}
    item = get_item(identifier, config=config)
    if (verbose) and (prev_identifier != identifier):
        sys.stdout.write('{0}:\n'.format(item.identifier))

    try:
        if args['--remote-name']:
            files = {args['--remote-name']: local_file}
        else:
            files = local_file
        response = item.upload(files, **upload_kwargs)
    except HTTPError as exc:
        response = [exc.response]
    if not response[0]:
        sys.exit(1)
    if response[0].status_code == 403:
        if (not item.session.access_key) and (not item.session.secret_key):
            sys.stderr.write('\nIAS3 Authentication failed. Please set your IAS3 '
                             'access key and secret key \nvia the environment '
                             'variables `IAS3_ACCESS_KEY` and `IAS3_SECRET_KEY`, '
                             'or \nrun `ia configure` to add your IAS3 keys to your '
                             'ia config file. You can \nobtain your IAS3 keys at the '
                             'following URL:\n\n\t'
                             'https://archive.org/account/s3.php\n\n')
        else:
            sys.stderr.write('\nIAS3 Authentication failed. It appears the keyset '
                             '"{0}:{1}" \ndoes not have permission to upload '
                             'to the given item or '
                             'collection.\n\n'.format(item.session.access_key,
                                                      item.session.secret_key))
        sys.exit(1)

    if args['--debug']:
        for i, r in enumerate(response):
            if i != 0:
                sys.stdout.write('---\n')
            headers = '\n'.join(
                [' {0}: {1}'.format(k, v) for (k, v) in r.headers.items()]
            )
            sys.stdout.write('Endpoint:\n {0}\n\n'.format(r.url))
            sys.stdout.write('HTTP Headers:\n{0}\n'.format(headers))
    else:
        for resp in response:
            if not resp:
                continue
            if (resp.status_code == 200) or (not resp.status_code):
                continue
            error = parseString(resp.content)
            code = get_xml_text(error.getElementsByTagName('Code'))
            msg = get_xml_text(error.getElementsByTagName('Message'))
            sys.stderr.write(
                'error "{0}" ({1}): {2}\n'.format(code, resp.status_code, msg)
            )
            sys.exit(1)
def format_output(query_results):
    '''
    Generate a tuple of lists from search_results.

    Inputs:
        query_results: Search object containing results from a query

    Returns:
        output: tuple
    '''
    items = []
    item_fields = []
    attribute_fields = ['ID', 'Title', 'Creator', 'Description']

    # restrict query results to MAX_NUM_RESULTS
    i = MAX_NUM_RESULTS
    for result in query_results:
        if i <= 0:
            break
        else:
            identifier = result['identifier']
            item = ia.get_item(identifier)
            items.append(item)
            i += -1

    for item in items:
        keys = item.metadata.keys()
        identifier = item.metadata['identifier']
        # not all items have a key for each of the following
        # so there is a need for checking
        if 'title' in keys:
            title = item.metadata['title']
        else:
            title = ''
        if 'creator' in keys:
            creator = item.metadata['creator']
        else:
            creator = ''
        if 'description' in keys:
            description = item.metadata['description']
            # remove html tag for line breaks
            description = re.split('<br />', description)
            description = " ".join(description)
        else:
            description = ''
        item_fields.append([identifier, title, creator, description])

    output = (attribute_fields, item_fields)
    return output
def check_if_ia_item_exists(self, infodict):
    # Note: takes self because the body reads self.verbose; in the original
    # flattened snippet the parameter was missing from the signature.
    itemname = sanitize_identifier('%s-%s' % (infodict['extractor'],
                                              infodict['display_id']))
    item = internetarchive.get_item(itemname)
    if item.exists and self.verbose:
        print("\n:: Item already exists. Not downloading.")
        print('Title: %s' % infodict['title'])
        print('Video URL: %s\n' % infodict['webpage_url'])
        return 1
    return 0
def main(argv):
    args = docopt(__doc__, argv=argv)

    item = get_item(args['<identifier>'])

    # Check existence of item.
    if args['--exists']:
        if item.exists:
            stdout.write('{0} exists\n'.format(item.identifier))
            exit(0)
        else:
            stderr.write('{0} does not exist\n'.format(item.identifier))
            exit(1)

    # Modify metadata.
    elif args['--modify']:
        metadata = get_args_dict(args['--modify'])
        response = modify_metadata(args['<identifier>'], metadata)
        status_code = response['status_code']
        if not response['content']['success']:
            error_msg = response['content']['error']
            stderr.write('error: {0} ({1})\n'.format(error_msg, status_code))
            exit(1)
        stdout.write('success: {0}\n'.format(response['content']['log']))

    # Get metadata.
    elif args['--files']:
        for i, f in enumerate(item.files()):
            if not args['--target']:
                files_md = [f.identifier, f.name, f.source, f.format, f.size, f.md5]
            else:
                files_md = [f.__dict__.get(k) for k in args['--target']]
            stdout.write('\t'.join([str(x) for x in files_md]) + '\n')
    elif args['--formats']:
        formats = set([f.format for f in item.files()])
        stdout.write('\n'.join(formats) + '\n')
    elif args['--target']:
        metadata = []
        for key in args['--target']:
            if '/' in key:
                for i, k in enumerate(key.split('/')):
                    if i == 0:
                        md = item.metadata.get(k)
                    else:
                        if md:
                            md = md.get(k)
            else:
                md = item.metadata.get(key)
            if md:
                metadata.append(md)
        stdout.write('\t'.join([str(x) for x in metadata]) + '\n')
    else:
        metadata = dumps(item.metadata)
        stdout.write(metadata + '\n')

    exit(0)
def delete_imagepdf(self, item, abby_filegz):
    head, abby_file = os.path.split(abby_filegz)
    pdffile = re.sub('_abbyy.gz$', '.pdf', abby_file)
    itemobj = internetarchive.get_item(item)
    fileobj = internetarchive.File(itemobj, pdffile)
    if fileobj and fileobj.source == 'derivative' and \
            fileobj.format == 'Image Container PDF':
        fileobj.delete(access_key=self.access_key, headers=self.headers,
                       secret_key=self.secret_key)
        self.logger.warn('Old image pdf exists in %s. Deleted it', item)
def save_items():
    subdirs = [d for d in next(os.walk(AUDIO_DIRS))[1]]
    num_subdirs = len(subdirs)
    items = {}
    for i, id in enumerate(subdirs):
        print str(i) + '/' + str(num_subdirs), id
        item = ia.get_item(id)
        items[id] = {}
        items[id]['metadata'] = item.metadata
        items[id]['files'] = item.files
    write_json(items, SBD_ITEMS)
def main(argv):
    args = docopt(__doc__, argv=argv)

    verbose = True if args['--quiet'] is False else False
    if verbose is not False:
        sys.stdout.write('getting item: {0}\n'.format(args['<identifier>']))

    headers = get_args_dict(args['--header'])
    if args['--size-hint']:
        headers['x-archive-size-hint'] = args['--size-hint']

    upload_kwargs = dict(
        metadata=get_args_dict(args['--metadata']),
        headers=headers,
        debug=args['--debug'],
        queue_derive=True if args['--no-derive'] is False else False,
        ignore_preexisting_bucket=args['--ignore-bucket'],
        verbose=verbose,
        delete=args['--delete'])

    # Upload stdin.
    if args['<file>'] == ['-'] and not args['-']:
        sys.stderr.write('--remote-name is required when uploading from stdin.\n')
        call(['ia', 'upload', '--help'])
        sys.exit(1)
    if args['-']:
        local_file = TemporaryFile()
        local_file.write(sys.stdin.read())
        local_file.seek(0)
        upload_kwargs['key'] = args['--remote-name']
    # Upload files.
    else:
        local_file = args['<file>']

    config = {} if not args['--log'] else {'logging': {'level': 'INFO'}}
    item = get_item(args['<identifier>'], config=config)
    response = item.upload(local_file, **upload_kwargs)

    if args['--debug']:
        for i, r in enumerate(response):
            if i != 0:
                sys.stdout.write('---\n')
            headers = '\n'.join([' {0}: {1}'.format(k, v)
                                 for (k, v) in r.headers.items()])
            sys.stdout.write('Endpoint:\n {0}\n\n'.format(r.url))
            sys.stdout.write('HTTP Headers:\n{0}\n'.format(headers))
    else:
        for resp in response:
            if resp.status_code == 200:
                continue
            error = parseString(resp.content)
            code = get_xml_text(error.getElementsByTagName('Code'))
            msg = get_xml_text(error.getElementsByTagName('Message'))
            sys.stderr.write('error "{0}" ({1}): {2}\n'.format(
                code, resp.status_code, msg))
            sys.exit(1)
def get_item_metadata(identifier):
    '''
    :param identifier: corresponds to 'identifier' on <class 'internetarchive.item.Item'>
    :return: metadata (dict) from a single item in the collection
    '''
    item = ''
    try:
        item = internetarchive.get_item(identifier)
    except Exception:
        pass
    return item.metadata
def test_get_item_with_config_file(tmpdir): tmpdir.chdir() test_conf = """[s3]\naccess = key2""" with open('ia_test.ini', 'w') as fh: fh.write(test_conf) with responses.RequestsMock() as rsps: rsps.add(responses.GET, '{0}//archive.org/metadata/nasa'.format(protocol), body=ITEM_METADATA, status=200) item = get_item('nasa', config_file='ia_test.ini') assert item.session.access_key == 'key2'
def getattr(self, path, fh=None):
    full_path = self._full_path(path)
    st = get_item(full_path)
    return {
        'st_atime': st['created'],
        'st_ctime': st['created'],
        'st_mtime': st['updated'],
        'st_size': st['item_size'],
        'st_gid': 0,
        'st_uid': 0
    }
def __init__(self, item_id, dst_dir, metadata):
    self.item_id = item_id
    self.item = get_item(item_id)  # create IA item
    self.dst_dir = dst_dir
    self.metadata = metadata

    # three strikes and we're out
    self.timeout = 3

    # IA S3 API keys
    self.access_key = None
    self.secret_key = None
def _upload_files(args, identifier, local_file, upload_kwargs):
    verbose = True if args['--quiet'] is False else False
    config = {} if not args['--log'] else {'logging': {'level': 'INFO'}}
    item = get_item(identifier, config=config)
    if verbose:
        sys.stdout.write('{0}:\n'.format(item.identifier))

    try:
        response = item.upload(local_file, **upload_kwargs)
    except HTTPError as exc:
        response = [exc.response]
    if not response[0]:
        sys.exit(1)
    if response[0].status_code == 403:
        if (not item.session.access_key) and (not item.session.secret_key):
            sys.stderr.write('\nIAS3 Authentication failed. Please set your IAS3 '
                             'access key and secret key \nvia the environment '
                             'variables `IAS3_ACCESS_KEY` and `IAS3_SECRET_KEY`, '
                             'or \nrun `ia configure` to add your IAS3 keys to your '
                             'ia config file. You can \nobtain your IAS3 keys at the '
                             'following URL:\n\n\t'
                             'https://archive.org/account/s3.php\n\n')
        else:
            sys.stderr.write('\nIAS3 Authentication failed. It appears the keyset '
                             '"{0}:{1}" \ndoes not have permission to upload '
                             'to the given item or '
                             'collection.\n\n'.format(item.session.access_key,
                                                      item.session.secret_key))
        sys.exit(1)

    if args['--debug']:
        for i, r in enumerate(response):
            if i != 0:
                sys.stdout.write('---\n')
            headers = '\n'.join(
                [' {0}: {1}'.format(k, v) for (k, v) in r.headers.items()]
            )
            sys.stdout.write('Endpoint:\n {0}\n\n'.format(r.url))
            sys.stdout.write('HTTP Headers:\n{0}\n'.format(headers))
    else:
        for resp in response:
            if not resp:
                continue
            if resp.status_code == 200:
                continue
            error = parseString(resp.content)
            code = get_xml_text(error.getElementsByTagName('Code'))
            msg = get_xml_text(error.getElementsByTagName('Message'))
            sys.stderr.write(
                'error "{0}" ({1}): {2}\n'.format(code, resp.status_code, msg)
            )
            sys.exit(1)