def main(args):
    with open("basho.json") as fp:
        basho = json.loads(fp.read())
    _id = basho[args.basho]["archive"]
    _dir = os.path.join(args.basho, '')  # ensure trailing slash
    desc = basho[args.basho]["description"] + basho["description"]
    item = {'collection': 'honbasho',
            'mediatype': 'movies',
            'description': desc}
    print _id,
    print pretty(item)

    if args.upload:
        print "=" * 72
        print "upload(%s, %s, metadata=%s)" % (_id, _dir, item)
        if raw_input("Continue with upload? (y/N): ") == 'y':
            upload(_id, _dir, metadata=item, verbose=True)

    # preview the metadata changes (dry run), so `meta` is defined for the
    # confirmation branch below
    meta = get_metadata(_id, _dir)
    update_metadata(_id, meta)

    if args.metadata:
        print "=" * 72
        if raw_input("Update metadata? (y/N): ") == 'y':
            update_metadata(_id, meta, for_real=True)
def test_upload_validate_identifier():
    try:
        upload('føø', NASA_METADATA_PATH,
               access_key='test_access',
               secret_key='test_secret',
               validate_identifier=True)
        assert False
    except Exception as exc:
        assert isinstance(exc, InvalidIdentifierException)

    expected_s3_headers = {
        'content-length': '7557',
        'x-archive-queue-derive': '1',
        'x-archive-meta00-scanner': 'uri(Internet%20Archive%20Python%20library',
        'x-archive-size-hint': '7557',
        'x-archive-auto-make-bucket': '1',
        'authorization': 'LOW test_access:test_secret',
    }
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, re.compile(r'.*s3.us.archive.org/.*'),
                 adding_headers=expected_s3_headers)
        rsps.add_metadata_mock('nasa')
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(PROTOCOL),
                 body='{}')
        upload('nasa', NASA_METADATA_PATH,
               access_key='test_access',
               secret_key='test_secret',
               validate_identifier=True)
        assert True
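# The try/except/assert-False pattern above predates pytest.raises; assuming
# the same test fixtures (upload, NASA_METADATA_PATH,
# InvalidIdentifierException) and that pytest is available, the same check
# can be written more directly:
import pytest

def test_upload_validate_identifier_sketch():
    with pytest.raises(InvalidIdentifierException):
        upload('føø', NASA_METADATA_PATH,
               access_key='test_access',
               secret_key='test_secret',
               validate_identifier=True)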
def ia_upload(self, identifier, metadata, to_upload, files, rawfile):
    success = False
    try:
        if metadata:
            r = upload(identifier, to_upload, metadata=metadata,
                       access_key=self.access_key,
                       secret_key=self.secret_key,
                       retries=100)
        else:
            r = upload(identifier, to_upload,
                       access_key=self.access_key,
                       secret_key=self.secret_key,
                       retries=100)
        success = True
    except HTTPError as e:
        self.logger.warn('Error in upload for %s: %s', identifier, e)
        msg = '%s' % e
        if re.search('Syntax error detected in pdf data', msg) or \
                re.search('error checking pdf file', msg):
            r = self.upload_bad_pdf(identifier, rawfile, files)
            success = True
    except Exception as e:
        self.logger.warn('Error in upload for %s: %s', identifier, e)
        success = False
    return success
def write_results_to_item(self, itemid, filename='results.json'):
    with tempfile.NamedTemporaryFile() as tmp:
        # NamedTemporaryFile opens in binary mode, so encode the JSON, then
        # flush and rewind before handing the file object to upload()
        tmp.write(json.dumps(self.results).encode())
        tmp.flush()
        tmp.seek(0)
        ia.upload(itemid,
                  {'%s_%s' % (self.book.identifier, filename): tmp},
                  access_key=S3_KEYS.get('access'),
                  secret_key=S3_KEYS.get('secret'))
def upload_ia_item(self):
    logger.debug("Uploading IA item for {}".format(self.ia_id))
    if not self.has_image and not self.has_crop:
        logger.debug("No images to upload")
        return None
    files = []
    if self.has_image:
        saved_image = self.save_image()
        files.append(saved_image)
    if self.has_crop:
        saved_crop = self.save_crop()
        files.append(saved_crop)
    internetarchive.upload(
        self.ia_id,
        files,
        metadata=self.ia_metadata,
        access_key=settings.IA_ACCESS_KEY_ID,
        secret_key=settings.IA_SECRET_ACCESS_KEY,
        checksum=False,
        verbose=True
    )
    if self.has_image:
        os.remove(saved_image)
    if self.has_crop:
        os.remove(saved_crop)
    return internetarchive.get_item(self.ia_id)
def upload_file(self, identifier, filepath):
    try:
        upload(identifier, [filepath],
               access_key=self.access_key,
               secret_key=self.secret_key)
    except Exception as e:
        self.logger.warn('Error in upload for %s: %s', filepath, e)
        return False
    return True
def archivewikipdf(wikilang='', project='', pagetitle=''):
    if re.search(r'(?im)[^a-z0-9_ ]', pagetitle):
        print('-' * 30)
        print("Error unsupported title", pagetitle)
        return
    langword = ''
    if wikilang in langs.keys():
        langword = langs[wikilang]
    else:
        print("Error unknown lang", wikilang)
        return
    projectucfirst = project[0].upper() + project[1:]
    pagetitle_ = re.sub(' ', '_', pagetitle)
    print('\n', '-' * 30, '\n', wikilang, pagetitle)
    pdfurl = 'https://%s.%s.org/api/rest_v1/page/pdf/%s' % (wikilang, project, pagetitle_)
    dateiso = datetime.datetime.now().isoformat().split('T')[0]
    dateiso2 = re.sub('-', '', dateiso)
    pdfname = '%s%s-%s-%s.pdf' % (wikilang, projects[project], pagetitle_, dateiso2)
    originalurl = 'https://%s.%s.org/wiki/%s' % (wikilang, project, pagetitle_)
    itemid = pdfname
    itemurl = 'https://archive.org/details/' + itemid
    itemhtml = getURL(url=itemurl, retry=False)
    if itemhtml and 'Item cannot be found' not in itemhtml:
        print('Skipping. Item exists', itemurl)
        return
    try:
        os.system('wget "%s" -O "%s"' % (pdfurl, pdfname))
        if os.path.exists(pdfname) and os.path.getsize(pdfname) < 1:
            print("Error generating PDF")
            os.remove(pdfname)
            return
    except Exception:
        print("Error generating PDF")
        return
    md = {
        'mediatype': 'texts',
        'creator': projectucfirst,
        'licenseurl': 'https://creativecommons.org/licenses/by-sa/3.0/',
        'language': langword,
        'genre': genres[project],
        'date': dateiso,
        'year': dateiso[:4],
        'description': '%s page.' % (projectucfirst),
        'subject': '%s; offline; pdf; page; mediawiki; %s; %s; %s; %s%s; %s' % (
            project.lower(), dateiso, wikilang, langword, wikilang,
            projects[project], pagetitle),
        'originalurl': originalurl,
    }
    try:
        internetarchive.upload(itemid, pdfname, metadata=md, queue_derive=False)
        print('Uploaded to https://archive.org/details/%s' % (itemid))
    except Exception:
        print("Error uploading file to", itemid)
    if pdfname and '.pdf' in pdfname and os.path.exists(pdfname):
        os.remove(pdfname)
def upload(self, itemid=None):
    # use a default so a missing attribute doesn't raise AttributeError
    if getattr(self, 'book', None):
        itemid = itemid or self.book.identifier
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(json.dumps(self.results).encode())
        tmp.flush()
        tmp.seek(0)  # rewind defensively before handing the file to upload()
        ia.upload(itemid,
                  {'%s_genome.json' % (itemid): tmp},
                  access_key=s3_keys['access'],
                  secret_key=s3_keys['secret'])
def _store_in_thread(self, file):
    file.seek(0)
    upload(
        self.identifier,
        files={self.filename: file},
        metadata=self.metadata,
        access_key=self.username,
        secret_key=self.password,
    )
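# A minimal sketch of the files-dict pattern used by _store_in_thread above:
# mapping a remote filename to an open file-like object lets upload() store
# an in-memory buffer under a chosen name. The identifier and keys are
# placeholders, not values taken from any snippet in this collection.
import io

from internetarchive import upload

def store_buffer_sketch():
    buf = io.BytesIO(b'{"status": "ok"}')
    buf.seek(0)  # upload() reads from the current position
    upload(
        'example-identifier',         # hypothetical identifier
        files={'results.json': buf},  # remote name -> file object
        access_key='YOUR_ACCESS_KEY',
        secret_key='YOUR_SECRET_KEY',
    )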
def process_upload(identifier, filename, fileobj, metadata):
    yield "Start upload to {0}...\n".format(identifier)
    internetarchive.upload(identifier,
                           files={filename: fileobj},
                           metadata=metadata,
                           checksum=False,
                           verify=False,
                           access_key=app.config['IA_ACCESS_KEY'],
                           secret_key=app.config['IA_SECRET_KEY'])
    yield "Done."
def upload_single(self, name, file, ia_args):
    internetarchive.upload(
        'archiveteam_newssites_{name}'.format(name=name),
        os.path.join(settings.dir_ready, file),
        metadata=ia_args,
        access_key=settings.access_key,
        secret_key=settings.secret_key,
        queue_derive=True,
        verify=True,
        verbose=True,
        delete=True,
        retries=10,
        retries_sleep=300)
def archive_create(archive_id, files, metadata, session, session_files,
                   archive_item_created=False):
    success = True
    for filepath in files:
        active_file = get_file_from_filepath(filepath, session_files)
        try:
            if not archive_item_created:
                r = upload(archive_id, [filepath], metadata)
            else:
                r = upload(archive_id, [filepath])
        except requests.exceptions.HTTPError as err:
            update_file_status(active_file, c.FILE_FAILED)
            log(active_file,
                'Failed to upload file to archive.org: ' + str(err),
                c.FILE_FAILED, c.LOG_ERROR)
            if active_file.type == 'video':
                success = False
                log(session,
                    "Failed to add new session to archive.org: " + str(err),
                    c.SESSION_FAILED, c.LOG_ERROR)
                return success
            # r is unbound when upload() raises, so skip the status check
            continue
        if r[0].status_code != 200:
            log(active_file,
                "Failed to upload file to archive.org: " + r[0].reason,
                c.FILE_FAILED, c.LOG_ERROR)
            if active_file.type == 'video':
                success = False
                log(session,
                    "Failed to add new session to archive.org: " + r[0].reason,
                    c.SESSION_FAILED, c.LOG_ERROR)
                return success
        else:
            archive_item_created = True
            log(active_file, "File uploaded to archive.org", c.FILE_SYNCED)
            update_file_status(active_file, c.FILE_SYNCED)
    if success:
        update_session_status(session, c.SESSION_SYNCED, archive_id)
        update_remote_archive_id(session.id, archive_id)
        log(session, "Session added to archive.org", c.SESSION_SYNCED)
    return success
def assert_title_page(identifier, titlepage, silent=False):
    tp = str(titlepage)
    result = list()
    # first download the scandata.xml file from the item
    try:
        item = ia.get_item(identifier)
    except Exception:
        raise ValueError('IA identifier not found.')
    scandata = identifier + '_scandata.xml'
    for f in item.files:
        if f['name'] == scandata:
            ia.download(identifier, files=scandata, silent=silent)
    with open(os.path.join(identifier, scandata), 'r') as fh:
        xml = fh.read()
    nochange = True
    match = False
    final = list()
    for line in xml.splitlines():
        newline = line
        if 'leafNum' in line:
            # like: <page leafNum="0">
            leafnum = line.split('"')[1]
            if leafnum == tp:
                match = True
        if 'pageType' in line:
            # like: <pageType>Normal</pageType>
            if match is True:
                if 'Title' in line:
                    result.append('Title page is already declared.')
                else:
                    newline = line.replace('Normal', 'Title')
                    nochange = False
                match = False  # don't match in the rest of this document
            elif 'Title' in line:
                # erroneous title page from IA
                newline = line.replace('Title', 'Normal')
                nochange = False
        final.append(newline)
    if nochange is True:
        result.append('No changes detected.')
    else:
        with open(os.path.join(identifier, scandata), 'w') as fh:
            fh.write('\n'.join(final))
        result.append('Generated new scandata.xml file and uploading...')
        ia.upload(identifier, files=[os.path.join(identifier, scandata)])
        result.append('Success!')
    rmtree(identifier)
    return '\n'.join(result)
def upload_doi(doi=None):
    m = s.get('https://api.crossref.org/works/{}[email protected]'.format(doi)).json()['message']
    md = {
        "collection": "opensource",
        "licenseurl": "https://creativecommons.org/publicdomain/mark/1.0/",
        "mediatype": "texts",
        "subject": "journals",
        "identifier-doi": doi,
        "external-identifier": [doi] + (m.get('alternative-id') or []),
        "originalurl": "https://doi.org/{}".format(doi),
        "source": "https://api.crossref.org/works/{}".format(doi),
        "article-type": m.get('type'),
        "creator": "; ".join([' '.join([a.get('given', ''), a.get('family', '')])
                              for a in m.get('author', [])]),
        # 'published-print' is a dict in Crossref records, so guard with dict
        # defaults instead of the list defaults the original used
        "date": "-".join([str(d).zfill(2)
                          for d in m.get('published-print', {}).get('date-parts', [[]])[0]]),
        "description": m.get('abstract', '') +
                       '<hr>\nThis paper is in the public domain in USA. '
                       'Metadata comes from the CrossRef API, see full record '
                       'in the source URL below.',
        "isbn": "; ".join(m.get('ISBN', [])),
        "issn": "; ".join(m.get('ISSN', [])),
        "journalabbrv": m.get('short-container-title'),
        "journaltitle": ' '.join(m.get('container-title', [])),
        "language": convertlang.get(m.get('language'), m.get('language')),
        "pagerange": m.get('page'),
        "publisher": m.get('publisher'),
        "publisher_location": m.get('publisher-location'),
        "title": m.get('title')[0],
        "volume": m.get('issue')  # note: fills IA 'volume' from Crossref's 'issue'
    }
    identifier = 'paper-doi-' + re.sub('[^-_A-Za-z0-9]', '_', doi)[:89]
    r = upload(identifier,
               files={identifier + '.pdf': quote_plus(doi) + '.pdf'},
               metadata=md)
def uploadToArchiveOrg(archive_org_config, file_path, row_data, templates, archivePrefix):
    MEDIA_TYPE = "movies"
    COLLECTION = "toplap"
    LICENSE_URL = "https://creativecommons.org/licenses/by-nc-sa/4.0/"
    HTML_WRAPPER_PRE = ('<span style="font-family:Roboto, Noto, sans-serif;'
                        'font-size:15px;white-space:pre-wrap;">')
    HTML_WRAPPER_POST = '</span>'
    description = HTML_WRAPPER_PRE + templateReplaceDesc(templates, row_data) + HTML_WRAPPER_POST
    description = description.replace(" -- ", "<br /><br />")
    file_id = archivePrefix + row_data['archive_id']
    try:
        meta_data = dict(mediatype=MEDIA_TYPE,
                         collection=COLLECTION,
                         creator=row_data['artist_name'],
                         date=row_data['performance_date'],
                         description=description,
                         licenseurl=LICENSE_URL,
                         subject=row_data['tags'].split(","),
                         title=templateReplaceTitle(templates, row_data))
        print(f'uploading file: {file_id}')
        result = upload(file_id, files=[file_path], metadata=meta_data, verbose=True)
        print(f'completed uploading file: {file_id}')
    except Exception as e:
        print(f'An error occurred: {e}')
def uploadToArchive1(metadata):
    identifier = metadata['identifier']
    flog = codecs.open('../logs/uploadLog1.txt', 'a', 'utf-8')
    if len(identifier) > 100:
        print('File name too long: ' + identifier)
        flog.write('File name too long: ' + identifier + '\n----------\n')
    else:
        accession = metadata['Accession_No']
        acc = re.sub('^SDPB', '', accession)
        acc = re.sub('([ABCDEFGHI])$', r'-\g<1>', acc)
        startMessage = (accession + '#' + identifier + '\n' +
                        'Started at ' + str(datetime.datetime.now()))
        print(startMessage)
        flog.write(startMessage + '\n')
        r = upload(identifier,
                   {identifier + '.pdf':
                    '../../ChunilalGandhiMSS/compressedPdfFiles/S.D.P.B._NO.' + acc + '.pdf'},
                   metadata=metadata)
        endMessage = (str(r[0].status_code) + '\n' +
                      'Ended at ' + str(datetime.datetime.now()) + '\n----------\n')
        print(endMessage)
        flog.write(endMessage)
    flog.close()
def archive_audio(obs_id):
    obs = Observation.objects.get(id=obs_id)
    suffix = '-{0}'.format(settings.ENVIRONMENT)
    if settings.ENVIRONMENT == 'production':
        suffix = ''
    identifier = 'satnogs{0}-observation-{1}'.format(suffix, obs.id)
    ogg = obs.payload.path
    filename = obs.payload.name.split('/')[-1]
    site = Site.objects.get_current()
    description = ('<p>Audio file from SatNOGS{0} <a href="{1}/observations/{2}">'
                   'Observation {3}</a>.</p>').format(suffix, site.domain, obs.id, obs.id)
    md = dict(collection=settings.ARCHIVE_COLLECTION,
              title=identifier,
              mediatype='audio',
              licenseurl='http://creativecommons.org/licenses/by-sa/4.0/',
              description=description)
    try:
        res = upload(identifier, files=[ogg], metadata=md,
                     access_key=settings.S3_ACCESS_KEY,
                     secret_key=settings.S3_SECRET_KEY)
    except (ReadTimeout, HTTPError):
        return
    if res[0].status_code == 200:
        obs.archived = True
        obs.archive_url = '{0}{1}/{2}'.format(settings.ARCHIVE_URL, identifier, filename)
        obs.archive_identifier = identifier
        obs.save()
        obs.payload.delete()
def test_upload():
    expected_s3_headers = {
        'content-length': '7557',
        'x-archive-queue-derive': '1',
        'x-archive-meta00-scanner': 'uri(Internet%20Archive%20Python%20library',
        'x-archive-size-hint': '7557',
        'content-md5': '6f1834f5c70c0eabf93dea675ccf90c4',
        'x-archive-auto-make-bucket': '1',
        'authorization': 'LOW test_access:test_secret',
    }
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, re.compile(r'.*s3.us.archive.org/.*'),
                 adding_headers=expected_s3_headers)
        rsps.add_metadata_mock('nasa')
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(PROTOCOL),
                 body='{}')
        _responses = upload('nasa', NASA_METADATA_PATH,
                            access_key='test_access',
                            secret_key='test_secret')
        for response in _responses:
            req = response.request
            headers = dict((k.lower(), str(v)) for k, v in req.headers.items())
            scanner_header = '%20'.join(
                response.headers['x-archive-meta00-scanner'].split('%20')[:4])
            headers['x-archive-meta00-scanner'] = scanner_header
            assert headers == expected_s3_headers
            assert req.url == '{0}//s3.us.archive.org/nasa/nasa.json'.format(PROTOCOL)
def test_upload():
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        expected_s3_headers = {
            'content-length': '7557',
            'x-archive-queue-derive': '1',
            'x-archive-meta00-scanner': 'uri(Internet%20Archive%20Python%20library',
            'x-archive-size-hint': '7557',
            'content-md5': '6f1834f5c70c0eabf93dea675ccf90c4',
            'x-archive-auto-make-bucket': '1',
            'authorization': 'LOW test_access:test_secret',
        }
        rsps.add(responses.PUT, re.compile(r'.*s3.us.archive.org/.*'),
                 adding_headers=expected_s3_headers, status=200)
        # responses requires a string or bytes body, not a dict
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body='{}', status=200)
        resp = upload('nasa', TEST_JSON_FILE, debug=True,
                      access_key='test_access', secret_key='test_secret')
        for r in resp:
            p = r.prepare()
            headers = dict((k.lower(), str(v)) for k, v in p.headers.items())
            scanner_header = '%20'.join(
                r.headers['x-archive-meta00-scanner'].split('%20')[:4])
            headers['x-archive-meta00-scanner'] = scanner_header
            assert headers == expected_s3_headers
            assert p.url == '{0}//s3.us.archive.org/nasa/nasa_meta.json'.format(protocol)
def upload_abbyy(self, ia_item, abby_filelist):
    metadata = {'ocr': 'google-cloud-vision IndianKanoon 1.0',
                'fts-ignore-ingestion-lang-filter': 'true'}
    abby_files_gz = []
    for abby_file in abby_filelist:
        abby_file_gz, n = re.subn('xml$', 'gz', abby_file)
        self.delete_imagepdf(ia_item, abby_file_gz)
        compress_abbyy(abby_file, abby_file_gz)
        abby_files_gz.append(abby_file_gz)
    self.update_metadata(ia_item, metadata)
    success = False
    while not success:
        try:
            # treat any non-raising upload as success
            upload(ia_item, abby_files_gz, headers=self.headers,
                   access_key=self.access_key,
                   secret_key=self.secret_key,
                   retries=100)
            success = True
        except Exception as e:
            self.logger.warn('Error in upload for %s: %s', ia_item, e)
            success = False
            time.sleep(120)
    return success
def ul(fn):
    print('this might take a couple mins... uploading %s' % fn)
    os.rename(fn, 'ul.m4a')
    md = dict(title=fn[:-16], mediatype='audio')
    h = str(random.getrandbits(128))
    r = upload(h, files={fn: 'ul.m4a'}, metadata=md)
    return h
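# ul() above names the item with a random 128-bit number, so the same file
# re-uploaded later lands in a new item. A deterministic alternative (an
# assumption, not part of the original snippet) derives the identifier from
# the file's content hash, making re-runs idempotent:
import hashlib

def content_identifier(path, prefix='audio-'):
    """Return a stable archive.org identifier derived from file contents."""
    digest = hashlib.sha1()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b''):
            digest.update(chunk)
    return prefix + digest.hexdigest()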
def test_upload():
    expected_s3_headers = {
        'content-length': '7557',
        'x-archive-queue-derive': '1',
        'x-archive-meta00-scanner': 'uri(Internet%20Archive%20Python%20library',
        'x-archive-size-hint': '7557',
        'content-md5': '6f1834f5c70c0eabf93dea675ccf90c4',
        'x-archive-auto-make-bucket': '1',
        'authorization': 'LOW test_access:test_secret',
    }
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, re.compile(r'.*s3.us.archive.org/.*'),
                 adding_headers=expected_s3_headers)
        rsps.add_metadata_mock('nasa')
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(PROTOCOL),
                 body='{}')
        _responses = upload('nasa', NASA_METADATA_PATH,
                            access_key='test_access',
                            secret_key='test_secret')
        for response in _responses:
            req = response.request
            headers = dict((k.lower(), str(v)) for k, v in req.headers.items())
            scanner_header = '%20'.join(
                response.headers['x-archive-meta00-scanner'].split('%20')[:4])
            headers['x-archive-meta00-scanner'] = scanner_header
            assert 'user-agent' in headers
            del headers['accept']
            del headers['accept-encoding']
            del headers['connection']
            del headers['user-agent']
            assert headers == expected_s3_headers
            assert req.url == '{0}//s3.us.archive.org/nasa/nasa.json'.format(PROTOCOL)
def upload_single_show_to_internetarchive(show_info: RefinedShow):
    show_title = f"Hooting Yard On The Air: {show_info.title()}"
    upload_id = f"{IA_PRFX}_{show_info.id}"
    log.info(f"Attempting to upload {show_info.id}, Title: {show_title}")
    show_text = show_info.get_title_and_text()
    show_toc = show_info.get_toc()
    md = {
        "collection": "hooting-yard",
        "description": show_toc,
        "mediatype": "audio",
        "title": show_title,
        "creator": "Frank Key",
        "date": show_info.tx_date().isoformat(),
        "notes": show_text,
    }
    log.info(f"Metadata: {pprint.pformat(md)}")
    try:
        item: Item = get_item(upload_id)
        log.info(f"Found an item: {item}")
        item.modify_metadata(metadata=md)
    except internetarchive.exceptions.ItemLocateError:
        r = upload(
            identifier=upload_id,
            files=[show_info.get_audio_file().path],
            metadata=md,
            verbose=True,
        )
        assert r[0].status_code == 200
    log.info(f"Completed upload {show_info.id}")
    return upload_id
def upload_to_ia(audio_filepath, title, this_ep_num):
    metadata = dict(collection='opensource_audio',
                    title='EB' + str(int(this_ep_num)) + ' - ' + title,
                    mediatype='audio',
                    language='eng',
                    description='The Encyclopedia Brunch podcast discusses ' + title,
                    subject=[title, 'Encyclopedia Brunch', 'Podcast'],
                    creator='T Dobbs, K Cogert',
                    licenseurl='http://creativecommons.org/licenses/by-nc-nd/3.0/')
    r = upload('EB' + str(int(this_ep_num)) + '-' + title.replace(' ', '_'),
               files={metadata['title'] + audio_filepath[audio_filepath.find('.'):]:
                      audio_filepath},
               metadata=metadata)
    return r[0].url
def upload(filename):
    # expanduser avoids a TypeError when HOME is unset
    if not os.path.isfile(os.path.expanduser("~/.config/ia.ini")):
        subprocess.call(["ia", "configure"])
    itemname = ARCHIVE_ITEM_NAME
    print("[archive] Uploading {}/{}...".format(itemname, filename))
    md = dict(title=itemname.upper(), mediatype='movies')
    r = internetarchive.upload(itemname, files=filename, metadata=md)
    print("[archive] Result {}".format(r[0].status_code))
    return "{}/{}".format(itemname, filename)
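# The snippet above shells out to `ia configure` when no config exists. The
# library also exposes internetarchive.configure(), which writes the same
# ia.ini; this is a sketch under the assumption that credentials are already
# in hand (the email/password names here are placeholders):
import os
import internetarchive

def ensure_ia_config(email, password):
    """Write ~/.config/ia.ini if it does not exist yet."""
    config_path = os.path.expanduser("~/.config/ia.ini")
    if not os.path.isfile(config_path):
        internetarchive.configure(email, password)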
def upload(ia_args, ia_files, ia_identifier):
    # skip files that already carry an .upload marker from a previous run
    ia_files_new = []
    for filename in ia_files:
        if not os.path.isfile(filename + '.upload'):
            ia_files_new.append(filename)
    ia_files = list(ia_files_new)
    if len(ia_files) > 0:
        for filename in ia_files:
            # create a marker so concurrent runs skip this file
            with open(filename + '.upload', 'w') as file:
                pass
            upload_response = internetarchive.upload(
                ia_identifier, filename,
                metadata=ia_args,
                access_key=ia_access_key,
                secret_key=ia_secret_key,
                queue_derive=True,
                verify=True,
                verbose=True,
                delete=True,
                retries=5,
                retries_sleep=300)
            os.remove(filename + '.upload')
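# In upload() above, an exception raised by internetarchive.upload() would
# leave the .upload marker behind, so the file would be skipped on every
# later run without ever having been uploaded. A context manager (a sketch,
# not part of the original snippet) makes the marker cleanup exception-safe:
import os
from contextlib import contextmanager

@contextmanager
def upload_marker(filename):
    """Create filename + '.upload' for the duration of an upload attempt."""
    marker = filename + '.upload'
    with open(marker, 'w'):
        pass  # create the marker so concurrent runs skip this file
    try:
        yield
    finally:
        os.remove(marker)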
def upload_to_internet_archive(self, link_guid):
    link = Link.objects.get(guid=link_guid)
    if not settings.UPLOAD_TO_INTERNET_ARCHIVE:
        return
    if not link.can_upload_to_internet_archive():
        print "Not eligible for upload."
        return
    metadata = {
        "collection": settings.INTERNET_ARCHIVE_COLLECTION,
        "title": '%s: %s' % (link_guid, truncatechars(link.submitted_title, 50)),
        "mediatype": 'web',
        "description": 'Perma.cc archive of %s created on %s.' % (link.submitted_url, link.creation_timestamp,),
        "contributor": 'Perma.cc',
        "submitted_url": link.submitted_url,
        "perma_url": "http://%s/%s" % (settings.HOST, link_guid),
        "external-identifier": 'urn:X-perma:%s' % link_guid,
    }
    # set sponsor if organization exists
    if link.organization:
        metadata["sponsor"] = "%s - %s" % (link.organization, link.organization.registrar)
    identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link_guid
    with default_storage.open(link.warc_storage_file(), 'rb') as warc_file:
        success = internetarchive.upload(
            identifier,
            warc_file,
            access_key=settings.INTERNET_ARCHIVE_ACCESS_KEY,
            secret_key=settings.INTERNET_ARCHIVE_SECRET_KEY,
            retries=10,
            retries_sleep=60,
            verbose=True,
        )
        if success:
            internetarchive.modify_metadata(
                identifier,
                metadata=metadata,
            )
            link.uploaded_to_internet_archive = True
            link.save()
        else:
            self.retry(exc=Exception("Internet Archive reported upload failure."))
            print "Failed."
    return success
def upload_single(self, name, f, ia_args):
    # read keys with a differently named handle; the original shadowed the
    # file argument `f`, which broke every later use of it
    with open(settings.keys, 'r') as keyfile:
        access_key, secret_key = keyfile.read().strip().split(':')
    try:
        internetarchive.upload(
            'archiveteam_newssites_{name}'.format(name=name),
            os.path.join(settings.dir_ready, f),
            metadata=ia_args,
            access_key=access_key,
            secret_key=secret_key,
            queue_derive=True,
            verify=True,
            verbose=True,
            delete=True,
            retries=10,
            retries_sleep=300)
    except Exception:
        pass  # failures are reported below via the leftover-file check
    self.concurrent_uploads -= 1
    os.remove(os.path.join(settings.dir_ready, f + '.upload'))
    if os.path.isfile(os.path.join(settings.dir_ready, f)):
        settings.irc_bot.send(
            'PRIVMSG',
            '{name} upload unsuccessful.'.format(name=f),
            settings.irc_channel_bot)
def url2ia(url):
    """Creates an archive.org item for the image at the given URL."""
    hr = requests.head(url)
    if 'image/' not in hr.headers['Content-Type']:
        raise ValueError(
            'Service only works with urls with clearly '
            'identifiable images (e.g. ending in .png, .jpg, .gif, etc.)')
    print(hr.headers['Content-Length'])
    if (int(hr.headers['Content-Length']) / 1000000.) > SIZE_LIMIT_MB:
        raise IOError('File size exceeds %smb' % SIZE_LIMIT_MB)
    filepath = download_file(url, path=PATH)
    filehash = get_filehash(filepath)
    filepath2 = os.path.join(PATH, filehash)
    os.rename(filepath, filepath2)
    ia.upload(URL2IIIF_ITEMNAME, filepath2, access_key=s3key, secret_key=s3secret)
    return filehash
def uploadDay(day):
    """Upload the archives in the directory for this day to the Internet Archive."""
    try:
        imagecount, pagecount, stampaid = getDayCounts(day)
        md = getBasicItemData()
        md["title"], md["issue"], id_testata = readIssueMetadata(day)
    except FileNotFoundError:
        # Handle: FileNotFoundError: [Errno 2] No such file or directory:
        # '1997-11-11/issue_metadata.json'
        print("WARNING: Day {} failed upon reading files".format(day))
        return False
    md["title"] = md["title"] + " ({})".format(day)
    md["external-identifier"] = "urn:archiviolastampa:{}".format(stampaid)
    # md["originalurl"] = "http://www.archiviolastampa.it/index2.php?option=com_lastampa&task=issue&no_html=1&type=info&issueid={}".format(stampaid)
    md["date"] = day
    md["pages"] = pagecount
    md["description"] = "Numero intero del giorno {} dall'archivio storico La Stampa.".format(day)
    # TODO: Needle defaults to 01. Maybe read the prefix in the actual files instead?
    if id_testata == "02":
        identifier = "stampa-sera_{}".format(day)
    else:
        identifier = "lastampa_{}".format(day)
    try:
        item = get_item(identifier)
        if item and item.item_size and item.item_size > 5000000:
            print("INFO: Day {} was already uploaded at {}, size {}. Skipping."
                  .format(day, identifier, item.item_size))
            return True
        iafiles = [day + '/' + arc.name for arc in Path(day).iterdir()]
        print("INFO: Uploading day {} with {} files".format(day, len(iafiles)))
        r = upload(identifier, files=iafiles, metadata=md,
                   retries=5, retries_sleep=300)
        sleep(5)
        if r[0].status_code < 400:
            return True
        # FIXME: Specifically handle the various failures, like:
        # ResponseError('too many 502 error responses')
        # Please reduce your request rate. - total_tasks_queued exceeds global_limit
    except Exception as e:
        print("ERROR: Upload failed for day {}".format(day))
        print(e)
    return False
def upload_single_file(line, identifier, arc_file_path, configs, access, secret, debug):
    result = upload(identifier,
                    files=arc_file_path,
                    metadata=configs['metadata'],
                    access_key=access,
                    secret_key=secret,
                    verbose=True,
                    retries=10,
                    retries_sleep=60,
                    debug=debug)
    # with debug=True, upload() returns request objects that carry no
    # status_code, hence the hasattr check
    if hasattr(result[0], 'status_code'):
        print("{}\t{}".format(result[0].status_code, line))
    else:
        print("{}\tstatus_code\t{}".format("ERROR", line))
def main(argv):
    args = docopt(__doc__, argv=argv)
    if args['--verbose'] and not args['--debug']:
        stdout.write('getting item: {0}\n'.format(args['<identifier>']))
    upload_kwargs = dict(metadata=get_args_dict(args['--metadata']),
                         headers=get_args_dict(args['--header']),
                         debug=args['--debug'],
                         # negate the flag: --no-derive means do not queue a
                         # derive task (the original passed it through unnegated)
                         queue_derive=not args['--no-derive'],
                         ignore_bucket=args['--ignore-bucket'],
                         verbose=args['--verbose'])

    # Upload stdin.
    if args['<file>'] == ['-'] and not args['-']:
        stderr.write('--remote-name is required when uploading from stdin.\n')
        call(['ia', 'upload', '--help'])
        exit(1)
    if args['-']:
        local_file = TemporaryFile()
        local_file.write(stdin.read())
        local_file.seek(0)
        upload_kwargs['remote_name'] = args['--remote-name']
    # Upload files.
    else:
        local_file = args['<file>']

    response = upload(args['<identifier>'], local_file, **upload_kwargs)

    if args['--debug']:
        for i, r in enumerate(response):
            if i != 0:
                stdout.write('---\n')
            headers = '\n'.join(
                [' {0}: {1}'.format(k, v) for (k, v) in r.headers.items()])
            stdout.write('Endpoint:\n {0}\n\n'.format(r.url))
            stdout.write('HTTP Headers:\n{0}\n'.format(headers))
    else:
        for resp in response:
            if resp.status_code == 200:
                continue
            error = parseString(resp.content)
            code = get_xml_text(error.getElementsByTagName('Code'))
            msg = get_xml_text(error.getElementsByTagName('Message'))
            stderr.write('error "{0}" ({1}): {2}\n'.format(
                code, resp.status_code, msg))
            exit(1)
def uploadToArchive(metadata):
    identifier = metadata['identifier']
    flog = codecs.open('../logs/uploadLog.txt', 'a', 'utf-8')
    if len(identifier) > 100:
        print('File name too long: ' + identifier)
        flog.write('File name too long: ' + identifier + '\n----------\n')
    else:
        accession = metadata['Accession_No']
        sr = metadata['Sr_No']
        startMessage = (sr + '#' + accession + '#' + identifier + '\n' +
                        'Started at ' + str(datetime.datetime.now()))
        print(startMessage)
        flog.write(startMessage + '\n')
        r = upload(identifier,
                   {identifier + '.pdf':
                    '../../ChunilalGandhiMSS/compressedPdfFiles/BOOK_NO.' + accession + '.pdf'},
                   metadata=metadata)
        endMessage = (str(r[0].status_code) + '\n' +
                      'Ended at ' + str(datetime.datetime.now()) + '\n----------\n')
        print(endMessage)
        flog.write(endMessage)
    flog.close()
def main(argv):
    args = docopt(__doc__, argv=argv)
    if args['--verbose'] and not args['--debug']:
        stdout.write('getting item: {0}\n'.format(args['<identifier>']))
    upload_kwargs = dict(
        metadata=get_args_dict(args['--metadata']),
        headers=get_args_dict(args['--header']),
        debug=args['--debug'],
        queue_derive=not args['--no-derive'],
        ignore_preexisting_bucket=args['--ignore-bucket'],
        verbose=args['--verbose'])

    # Upload stdin.
    if args['<file>'] == ['-'] and not args['-']:
        stderr.write('--remote-name is required when uploading from stdin.\n')
        call(['ia', 'upload', '--help'])
        exit(1)
    if args['-']:
        local_file = TemporaryFile()
        local_file.write(stdin.read())
        local_file.seek(0)
        upload_kwargs['remote_name'] = args['--remote-name']
    # Upload files.
    else:
        local_file = args['<file>']

    response = upload(args['<identifier>'], local_file, **upload_kwargs)

    if args['--debug']:
        for i, r in enumerate(response):
            if i != 0:
                stdout.write('---\n')
            headers = '\n'.join(
                [' {0}: {1}'.format(k, v) for (k, v) in r.headers.items()])
            stdout.write('Endpoint:\n {0}\n\n'.format(r.url))
            stdout.write('HTTP Headers:\n{0}\n'.format(headers))
    else:
        for resp in response:
            if resp.status_code == 200:
                continue
            error = parseString(resp.content)
            code = get_xml_text(error.getElementsByTagName('Code'))
            msg = get_xml_text(error.getElementsByTagName('Message'))
            stderr.write('error "{0}" ({1}): {2}\n'.format(
                code, resp.status_code, msg))
            exit(1)
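# Typical invocations the two docopt-based mains above are written to handle
# (the exact usage string lives in each module's __doc__, which is not part
# of this collection, so these examples are assumptions based on the options
# the code parses):
#
#   ia upload <identifier> file.zip --metadata='title:My Item' --verbose
#   cat report.pdf | ia upload <identifier> - --remote-name=report.pdf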
def test_upload():
    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
        expected_s3_headers = {
            "content-length": "7557",
            "x-archive-queue-derive": "1",
            "x-archive-meta00-scanner": "uri(Internet%20Archive%20Python%20library",
            "x-archive-size-hint": "7557",
            "content-md5": "6f1834f5c70c0eabf93dea675ccf90c4",
            "x-archive-auto-make-bucket": "1",
            "authorization": "LOW test_access:test_secret",
        }
        rsps.add(responses.PUT, re.compile(r".*s3.us.archive.org/.*"),
                 adding_headers=expected_s3_headers, status=200)
        # responses requires a string or bytes body, not a dict
        rsps.add(responses.GET,
                 "{0}//archive.org/metadata/nasa".format(protocol),
                 body="{}", status=200)
        resp = upload("nasa", TEST_JSON_FILE, debug=True,
                      access_key="test_access", secret_key="test_secret")
        for r in resp:
            p = r.prepare()
            headers = dict((k.lower(), str(v)) for k, v in p.headers.items())
            scanner_header = "%20".join(
                r.headers["x-archive-meta00-scanner"].split("%20")[:4])
            headers["x-archive-meta00-scanner"] = scanner_header
            assert headers == expected_s3_headers
            assert p.url == "{0}//s3.us.archive.org/nasa/nasa_meta.json".format(protocol)
def upload_to_internet_archive(self, link_guid):
    try:
        link = Link.objects.get(guid=link_guid)
        if link.internet_archive_upload_status == 'failed_permanently':
            return
    except Link.DoesNotExist:
        print "Link %s does not exist" % link_guid
        return

    if not settings.UPLOAD_TO_INTERNET_ARCHIVE:
        return
    if not link.can_upload_to_internet_archive():
        print "Not eligible for upload."
        return

    metadata = {
        "collection": settings.INTERNET_ARCHIVE_COLLECTION,
        "title": '%s: %s' % (link_guid, truncatechars(link.submitted_title, 50)),
        "mediatype": 'web',
        "description": 'Perma.cc archive of %s created on %s.' % (link.submitted_url, link.creation_timestamp,),
        "contributor": 'Perma.cc',
        "submitted_url": link.submitted_url,
        "perma_url": "http://%s/%s" % (settings.HOST, link_guid),
        "external-identifier": 'urn:X-perma:%s' % link_guid,
    }
    # set sponsor if organization exists
    if link.organization:
        metadata["sponsor"] = "%s - %s" % (link.organization, link.organization.registrar)

    identifier = settings.INTERNET_ARCHIVE_IDENTIFIER_PREFIX + link_guid
    try:
        if default_storage.exists(link.warc_storage_file()):
            item = internetarchive.get_item(identifier)
            # if item already exists (but has been removed),
            # ia won't update its metadata in upload function
            if item.exists and item.metadata['title'] == 'Removed':
                item.modify_metadata(metadata,
                                     access_key=settings.INTERNET_ARCHIVE_ACCESS_KEY,
                                     secret_key=settings.INTERNET_ARCHIVE_SECRET_KEY,
                                     )
            with default_storage.open(link.warc_storage_file(), 'rb') as warc_file:
                success = internetarchive.upload(
                    identifier,
                    warc_file,
                    metadata=metadata,
                    access_key=settings.INTERNET_ARCHIVE_ACCESS_KEY,
                    secret_key=settings.INTERNET_ARCHIVE_SECRET_KEY,
                    retries=10,
                    retries_sleep=60,
                    verbose=True,
                )
            if success:
                link.internet_archive_upload_status = 'completed'
                link.save()
            else:
                link.internet_archive_upload_status = 'failed'
                self.retry(exc=Exception("Internet Archive reported upload failure."))
            return success
        else:
            link.internet_archive_upload_status = 'failed_permanently'
            link.save()
    except requests.ConnectionError as e:
        logger.exception("Upload to Internet Archive task failed because of a "
                         "connection error. \nLink GUID: %s\nError: %s" % (link.pk, e))
        return
def main():
    print("Internet Archive Publisher")
    while True:
        file_id = input("File ID: ")
        if not file_id:
            break
        # Load file
        f = File.objects.get(pk=int(file_id))
        print("Selected:", f, "(" + f.filename + ")")
        for base in BASES.keys():
            print("[" + base + "]", BASES[base]["name"])
        selected_base = input("Select package base: ").upper()
        base = BASES[selected_base]
        # Copy the zip
        zip_name = "zzt_" + f.filename
        shutil.copy(ZGAMES_PATH + f.download_url(), zip_name)
        launch_file = ""  # ensure defined even when the base has no ZZT.CFG
        # Open the WIP zip
        with ZipFile(zip_name, "a") as z:
            # Insert the base files
            to_add = glob.glob(os.path.join(BASE_PATH, base["directory"], "*"))
            for a in to_add:
                z.write(a, arcname=os.path.basename(a))
            # Create ZZT.CFG if needed
            if base["use_cfg"]:
                # Find the relevant files to default to
                file_list = z.namelist()
                for idx, name in enumerate(file_list, start=1):
                    print(idx, name)
                selected_idx = int(input("Launch which file? ")) - 1
                launch_file = z.namelist()[selected_idx]
                config_content = launch_file[:-4]  # Remove .ZZT extension
                if base["registered"]:
                    config_content += "\r\nREGISTERED"
                z.writestr("ZZT.CFG", config_content)
        # Zip file is completed, prepare the upload
        meta = {
            "title": f.title,
            "mediatype": "software",
            "collection": "open_source_software",
            "emulator": "dosbox",
            "emulator_ext": "zip",
            "emulator_start": base["executable"] + " " + launch_file,
            "year": str(f.release_date)[:4],
            "subject": ["zzt"] + f.genre.split("/"),
            "creator": f.author.split("/"),
            "description": "World created using the ZZT engine."
        }
        print("Uploading to Internet Archive...")
        r = upload(base["prefix"] + f.filename[:-4], files=[zip_name], metadata=meta)
        if r[0].status_code == 200:
            print("Upload successful!")
            f.archive_name = base["prefix"] + f.filename[:-4]
            f.save()
            print("https://archive.org/details/" + f.archive_name)
            os.remove(zip_name)
        else:
            print("Upload failed!")
            print(r)
    return True
def mirror(request, pk):
    """ Returns page to publish file on Archive.org """
    f = File.objects.get(pk=pk)
    data = {
        "title": "Archive.org Mirror",
        "file": f,
        "ret": None,
        "packages": PACKAGE_PROFILES
    }
    package = int(request.GET.get("package", 0))
    data["package"] = PACKAGE_PROFILES[package]
    # use true division; ceil of floor division was a no-op
    data["split"] = math.ceil(len(data["packages"]) / 2)
    zip_file = zipfile.ZipFile(os.path.join(SITE_ROOT, f.download_url()[1:]))
    file_list = zip_file.namelist()
    file_list.sort(key=str.lower)
    data["file_list"] = file_list

    # Mirror the file
    if request.POST.get("mirror"):
        if request.POST.get("package") != "NONE":
            package = PACKAGE_PROFILES[int(request.POST.get("package", 0))]
            # Advanced settings
            zip_name = package["prefix"] + f.filename
            if request.POST.get("upload_name"):
                upload_name = request.POST["upload_name"]
            else:
                upload_name = zip_name[:-4]
            # Copy the base package zip
            shutil.copy(SITE_ROOT + f.download_url(),
                        os.path.join(TEMP_PATH, zip_name))
            # Handle alternative Zip upload
            if request.FILES.get("alt_src"):
                with open(os.path.join(TEMP_PATH, zip_name), "wb") as fh:
                    fh.write(request.FILES["alt_src"].read())
            temp_zip = os.path.join(TEMP_PATH, zip_name)
            # Open the WIP zip
            with ZipFile(temp_zip, "a") as z:
                # Insert the base files
                to_add = glob.glob(os.path.join(BASE_PATH, package["directory"], "*"))
                for a in to_add:
                    z.write(a, arcname=os.path.basename(a))
                # Create ZZT.CFG if needed
                if package.get("use_cfg"):
                    # Remove .ZZT extension
                    config_content = request.POST.get("launch")[:-4].upper()
                    if package["registered"]:
                        config_content += "\r\nREGISTERED"
                    z.writestr("ZZT.CFG", config_content)
            # Create description
            description = "{}\n\n{}".format(package["auto_desc"],
                                            request.POST.get("description", ""))
            # Determine the launch command
            if request.POST.get("alt_launch"):
                launch_command = request.POST["alt_launch"]
            else:
                launch_command = package["executable"] + " " + request.POST.get("launch", "").upper()
            # Zip file is completed, prepare the upload
            meta = {
                "title": request.POST.get("title"),
                "mediatype": "software",
                "collection": ARCHIVE_COLLECTION,
                "emulator": "dosbox",
                "emulator_ext": "zip",
                "emulator_start": launch_command,
                "year": str(f.release_date)[:4],
                "subject": [package["engine"]] + f.genre.split("/"),
                "creator": f.author.split("/"),
                "description": description
            }
            if DEBUG:
                upload_name = "test-" + upload_name
            print("I'm gonna upload:", os.path.join(TEMP_PATH, zip_name))
            file_path = os.path.join(TEMP_PATH, zip_name)
            r = upload(
                upload_name,
                files=[file_path],
                metadata=meta,
                access_key=IA_ACCESS,
                secret_key=IA_SECRET,
            )
            if r[0].status_code == 200:
                data["status"] = "SUCCESS"
                f.archive_name = upload_name
                f.save()
                os.remove(os.path.join(TEMP_PATH, zip_name))
            else:
                data["status"] = "FAILURE"
                data["archive_resp"] = r
    return render(request, "museum_site/tools/mirror.html", data)
# Move the original file to Internet Archive namespace
shutil.copy(output_path, ssht.get_image_name())

# Save crop to Internet Archive namespace
crop.save(open(ssht.get_crop_name(), 'w'))

# Upload both images to Internet Archive
files = [ssht.get_image_name(), ssht.get_crop_name()]
try:
    logger.debug("Uploading to internetarchive as {}".format(ssht.ia_id))
    internetarchive.upload(
        ssht.ia_id,
        files,
        metadata=ssht.ia_metadata,
        access_key=settings.IA_ACCESS_KEY_ID,
        secret_key=settings.IA_SECRET_ACCESS_KEY,
        checksum=False,
        verbose=True
    )
except Exception as e:
    logger.error("internetarchive error: %s" % e)
    ScreenshotLog.objects.create(
        update=update,
        site=site,
        message_type="error",
        message="internetarchive error: %s" % e
    )
    ssht.delete()
    return False