def __init__(self, metadata, **kwargs):
    # super(InternetArchive, self).__init__(**kwargs)
    self.SCRIBE_METADATA = metadata
    self.s = get_ia_session()
    self.SECURE = self.s.secure
    log.info("Initialized InternetArchive object")
def _update_set_book_repub_state(self):
    state = -2
    errors = []
    self.dispatch_progress('Setting republisher state to {}'.format(state))
    for _ in range(scribe_globals.TASK_DEFAULT_MAX_RETRIES):
        try:
            self.dispatch_progress(
                '[{}/{}] Setting republisher state to {}'.format(
                    _ + 1, scribe_globals.TASK_DEFAULT_MAX_RETRIES, state))
            ia_item = get_ia_session().get_item(self.identifier)
            resp = ia_item.modify_metadata({'repub_state': state})
            self.logger.info('Response from cluster: {} | '
                             'Headers {}'.format(resp.text, resp.headers))
        except Exception as e:
            self.logger.error(
                '[{}/{}] Transient error {} while setting repub_state to {}.'
                .format(_ + 1, scribe_globals.TASK_DEFAULT_MAX_RETRIES,
                        e, state))
            errors.append(e)
            continue
        else:
            break
    else:
        self.logger.error(
            'Could not set repub_state to {} because {}'.format(
                state, errors[-1]))
        payload = {
            'task_type': 'MARCMetadataViaDWWITask',
            'selector': self.isbn,
            'type': 'isbn',
            'errors': json.dumps([str(x) for x in errors]),
            'attempts': len(errors),
        }
        push_event('tts-task-exception', payload)
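# A minimal, self-contained sketch (illustrative only, not part of this
# module's API) of the for/else retry idiom that _update_set_book_repub_state
# above relies on: the loop's else branch runs only when every attempt
# failed, i.e. the loop finished without hitting break.
# attempt_update and max_retries are hypothetical names.
def _retry_with_for_else(attempt_update, max_retries=3):
    errors = []
    for attempt in range(max_retries):
        try:
            attempt_update()
        except Exception as e:
            errors.append(e)
            continue
        else:
            break  # success: skips the for/else branch below
    else:
        # Only reached when all attempts raised.
        raise RuntimeError('update failed after {} attempts: {}'.format(
            len(errors), errors[-1]))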
def get_pending_catalog_tasks(identifier):
    ia_session = get_ia_session()
    item = ia_session.get_item(identifier)
    tasks = item.get_task_summary()
    tasks_list = [
        x for x in item.tasks
        if x.get('status') in ['running', 'queued']
    ] if item.tasks else []
    return tasks['running'] + tasks['queued'], tasks_list
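# Hypothetical usage sketch for get_pending_catalog_tasks (defined above):
# poll until the item has no running or queued catalog tasks before acting
# on it. poll_seconds and max_polls are illustrative defaults, and the
# module-level time import used elsewhere in this file is assumed.
def wait_for_catalog_tasks(identifier, poll_seconds=60, max_polls=10):
    for _ in range(max_polls):
        pending_count, _tasks = get_pending_catalog_tasks(identifier)
        if pending_count == 0:
            return True
        time.sleep(poll_seconds)
    return False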
def _download_ia_item(self):
    self.dispatch_progress('Downloading ia item')
    session = get_ia_session()
    self._ia_item = session.get_item(self.identifier)
    error_message = self._validate_ia_item()
    if error_message:
        self._should_delete_book = False
        self.error = ValueError(error_message)
def _verify_preconditions(self):
    self.dispatch_progress('Verifying preconditions')
    # Bail out as soon as a precondition fails; otherwise the final
    # assignment would overwrite the flag and always report success.
    if not self.book.has_identifier():
        self.ABLE_TO_CHANGE_REPUB_STATE = False
        return False
    item = get_ia_session().get_item(self.book.identifier)
    if not item.exists:
        self.ABLE_TO_CHANGE_REPUB_STATE = False
        return False
    if item.metadata.get('repub_state') != '{}'.format(
            self.ALLOWED_REPUB_STATE):
        self.ABLE_TO_CHANGE_REPUB_STATE = False
        return False
    self.ABLE_TO_CHANGE_REPUB_STATE = True
    return True
def send_rejection(book):
    book.logger.info('Sending metrics to IA')
    metadata = book.metadata
    slip_metadata = book.get_slip_metadata()
    metadata.update(slip_metadata)
    book.logger.info('Sending this metadata upstream')
    session = get_ia_session()
    namespace = get_rejection_namespace(slip_metadata)
    result, payload = send_rejection_event(session, book, metadata, namespace)
    if result:
        book.logger.info('Metric successfully sent. Now deleting....')
        book.do_move_to_trash()
    else:
        # On failure, payload is expected to carry the exception raised
        # while sending the rejection event.
        raise payload
def upload_book(book):
    Logger = book.logger
    Logger.debug('Starting upload of ' + book['identifier'])
    _check_preconditons(book)
    # book.do_book_upload_begin()
    _set_upload_lock_file(book, Logger)
    responses_dict = {}
    book_upload_total_start = time.time()
    try:
        scandata = ScanData(book['path'])
        zip_path = _check_preimage_is_valid(book)
        ia_session = get_ia_session()
        item = ia_session.get_item(book['identifier'])
        Logger.info('Got item {}'.format(item.identifier))

        if not book.force_upload:
            _check_remote_preconditons(item, Logger)

        encoded_md = _prepare_metadata(book, item, Logger)
        metasource_file_location, metasource_file_upload_name = \
            _generate_metasource(book, Logger)

        responses = []
        book_upload_phase_start = time.time()
        needs_metadata_pushed = item.exists
        doing_foldouts = os.path.exists(
            os.path.join(book['path'], 'send_to_station'))

        book_preimage_upload_start, \
            book_preimage_upload_end, \
            sizes_dict = _upload_book_files(
                zip_path, book, encoded_md, item, responses,
                metasource_file_location, metasource_file_upload_name, Logger)

        if needs_metadata_pushed:
            _only_push_metadata(encoded_md, book, item, responses, Logger)

        book_upload_phase_end = time.time()

        _upload_logs(book=book, item=item, responses=responses)
        _verify_responses(responses, Logger)

        Logger.debug('OK! Finished uploads to {} | Took {}s'.format(
            book['identifier'],
            book_upload_phase_end - book_upload_phase_start))

        book.do_upload_book_end()

        _push_metrics(book, scandata, encoded_md, sizes_dict, doing_foldouts,
                      responses, responses_dict, book_upload_phase_start,
                      book_upload_phase_end, book_upload_total_start,
                      book_preimage_upload_start, book_preimage_upload_end)

        if config.is_true('show_book_notifications'):
            notifications_manager.add_notification(
                title='Uploaded',
                message="{} has been successfully uploaded.".format(
                    book['identifier']),
                book=book)

        Logger.debug('Finished upload for ' + book['identifier'])
        # Clock.schedule_once(partial(self.update_status_callback, book))

        time.sleep(10)  # Wait for book to be added to metadata api
    except requests.ConnectionError as e:
        book.do_upload_book_error()
        Logger.error(traceback.format_exc())
        payload = {
            'local_id': book['uuid'],
            'status': book['status'],
            'exception': str(e)
        }
        push_event('tts-book-failed-upload', payload,
                   'book', book['identifier'])
        raise ScribeException('Upload Failed. '
                              'Please check network and S3 Keys')
    except Exception as e:
        book.do_upload_book_error()
        Logger.error(traceback.format_exc())
        payload = {
            'local_id': book['uuid'],
            'status': book['status'],
            'responses': responses_dict,
            'exception': str(e)
        }
        push_event('tts-book-upload-exception', payload,
                   'book', book['identifier'])
        raise ScribeException('Upload Failed! - {}'.format(str(e)))
    finally:
        book.force_upload = False
        Logger.info("Removing upload lock file at {}".format(
            join(book['path'], "upload_lock")))
        os.remove(join(book['path'], "upload_lock"))
def upload_book_foldouts(book):
    try:
        Logger = book.logger
        Logger.info('upload_book_foldouts: Uploading foldouts for book '
                    '{}'.format(book))
        ia_session = get_ia_session()
        book_item = ia_session.get_item(book['identifier'])
        _check_preconditions(book, book_item, Logger)
        book_folder = 'foldouts'
        cdic, tdic, rdic, rtdic = _create_scandata(book, book_folder,
                                                   True, Logger)
        responses = []

        # Upload the pictures
        Logger.debug('upload_book_foldouts: Uploading pics')
        book.update_message('Foldouts upload | Images')
        if cdic != {}:
            res = book_item.upload(cdic, retries=10, verify=True,
                                   retries_sleep=60, queue_derive=False)
            responses.append(res)
        if tdic != {}:
            res = book_item.upload(tdic, retries=10, verify=True,
                                   retries_sleep=60, queue_derive=False)
            responses.append(res)
        try:
            if rdic != {}:
                res = book_item.upload(rdic, retries=10, verify=True,
                                       retries_sleep=60, queue_derive=False)
                responses.append(res)
            if rtdic != {}:
                res = book_item.upload(rtdic, retries=10, verify=True,
                                       retries_sleep=60, queue_derive=False)
                responses.append(res)
        except requests.exceptions.ConnectionError as e:
            Logger.error(
                'upload_book_foldouts: Connection exception {} '
                'has occurred at rdic upload time; aborting!'.format(str(e)))
            raise e
        except Exception as e:
            Logger.error('upload_book_foldouts: Error {} has occurred '
                         'at rdic upload time'.format(e))
            raise e

        Logger.debug('upload_book_foldouts: Done. Uploading scandata...')

        # Upload the scandata
        target_scandata = 'scandata.json'
        book.update_message('Foldouts upload | Scandata')
        scandata = join(book['path'], 'scandata_rerepublished.json')
        upload_res = book_item.upload({target_scandata: scandata},
                                      retries=10, retries_sleep=60,
                                      queue_derive=False, verify=True)

        if os.path.exists(os.path.join(book['path'], 'scanning.log')):
            book.update_message('Foldouts upload | Log')
            book.logger.debug('Uploading scanning log file')
            upload_name_mapping = {
                'logs/' + book['identifier']
                + '_scanning_{:%Y-%m-%d%H:%M:%S}.log'.format(datetime.now()):
                    join(book['path'], 'scanning.log')
            }
            response = book_item.upload(upload_name_mapping, retries=10,
                                        retries_sleep=60, queue_derive=False,
                                        verbose=True, verify=True)
            responses.append(response)
            url_to_status_code = \
                {r.request.url: r.status_code for r in response}
            book.logger.debug('Response from upload: {} | {}'
                              .format(response, url_to_status_code))

        responses.append(upload_res)  # corrections_uploaded

        # flatten responses list:
        flat_responses = [y for x in responses for y in x]
        for response in flat_responses:
            Logger.info('{} returned {}'.format(response.url,
                                                response.status_code))
            if response.status_code != 200:
                raise Exception(
                    'upload_book_foldouts: Response code {} {} - {} from cluster. '
                    'URL was: {} | content: {} '
                    'This is an error. Upload will be attempted again.'
                    .format(response.status_code, response.reason,
                            response.text if hasattr(response, 'text') else '',
                            response.url, response.content))

        Logger.debug('Done. Changing repub state...')
        _change_repub_state(book_item, 43)
        _remove_book_from_btserver_item(book, Logger)
        book.do_upload_foldouts_done()
        payload = {
            'repub_state': 43,
            'responses': flat_responses,
        }
        push_event('tts-book-corrections-sent', payload,
                   'book', book['identifier'])
        Logger.debug('All done.')
        return
    except requests.ConnectionError as e:
        raise ScribeException('Upload Failed. Please check network and '
                              'S3 Keys (Error was: {})'.format(e))
    except Exception as e:
        book.do_upload_foldouts_fail()
        book.raise_exception(e)
def _load_item(self):
    self.dispatch_progress('Asking Archive.org for {}'.format(
        self.identifier))
    self.ia_item = get_ia_session().get_item(self.identifier)
def get_repub_state(book):
    session = get_ia_session()
    item = session.get_item(book.identifier)
    # Raises KeyError if the item has no repub_state in its metadata.
    repub_state = int(item.metadata['repub_state'])
    return repub_state
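# Defensive-call sketch (hypothetical helper, not used elsewhere in this
# module): items that have never entered the republishing pipeline may lack
# a repub_state field, so the metadata lookup in get_repub_state above
# raises KeyError. This wrapper maps that case to None, mirroring how
# verify_uploaded below treats a missing repub_state.
def get_repub_state_or_none(book):
    try:
        return get_repub_state(book)
    except KeyError:
        return None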
def item_ready_for_upload(book):
    '''Book items might have already been preloaded with metadata in the IA
    scan process. However, prevent uploading to ia items which already have
    images uploaded.

    Called in worker thread.
    '''
    try:
        session = get_ia_session()
        item = session.get_item(book.identifier)
        if not item.exists:
            if book:
                preloaded_path = os.path.join(book.path, 'preloaded')
                if os.path.exists(preloaded_path):
                    # This item was created in offline mode, but the
                    # identifier doesn't exist
                    book.logger.error(
                        'Item {0} is tagged as preloaded, but '
                        'the identifier does not exist. Aborting '
                        'upload and reverting to scribing '
                        'status.'.format(book.identifier))
                    return False
                else:
                    book.logger.info(
                        'Item does not exist and user wants to '
                        'upload to item {0}. Ok\'ing that'.format(
                            book.identifier))
            # no existing item, so safe to use this identifier
            return True

        allowed_formats = {
            'Metadata', 'MARC', 'MARC Source', 'MARC Binary', 'Dublin Core',
            'Archive BitTorrent', 'Web ARChive GZ', 'Web ARChive', 'Log',
            'OCLC xISBN JSON', 'Internet Archive ARC',
            'Internet Archive ARC GZ', 'CDX Index', 'Item CDX Index',
            'Item CDX Meta-Index', 'WARC CDX Index', 'Metadata Log'
        }

        ALLOWED_ITEM_FILE_NAMES = [
            '{}_{}'.format(book.identifier, x)
            for x in ALLOWED_VARIABLE_FILE_NAMES
        ]

        for item_file_metadata in item.files:
            if item_file_metadata['format'] not in allowed_formats:
                # Ignore new style in-item thumb files
                if item_file_metadata['name'] in ALLOWED_FILE_NAMES:
                    book.logger.info(
                        'File {} ({}) is present in '
                        'remote item and allowed: continuing...'.format(
                            item_file_metadata['name'],
                            item_file_metadata['format']))
                    continue
                elif item_file_metadata['name'] in ALLOWED_ITEM_FILE_NAMES:
                    continue
                # files have already been uploaded to this item
                book.logger.error(
                    'File {} in item {} is blocking upload.'.format(
                        item_file_metadata, item.identifier))
                return False
    except Exception:
        book.logger.error(traceback.format_exc())
        raise ScribeException('Could not check status of IA item {}'.format(
            book.identifier))
    return True
def verify_uploaded(book):
    ia_session = get_ia_session()
    book.logger.info(
        'verify_uploaded: Verifying {} was uploaded to the cluster.'.format(
            book))
    # we do have identifier in the book dictionary, but we only trust
    # what's on the drive for this one
    identifier = book.identifier
    if not identifier:
        book.logger.info(
            'verify_uploaded: No identifier.txt. '
            'Assuming empty book and deleting.')
        return True
    book.logger.info(
        'verify_uploaded: Read {} from identifier.txt.'.format(identifier))

    # gather data
    i = ia_session.get_item(identifier)
    repub_state = int(
        i.metadata['repub_state']) if 'repub_state' in i.metadata else None
    book.logger.info('verify_uploaded: repub_state {}'.format(repub_state))
    scandate = datetime.strptime(
        i.metadata['scandate'],
        '%Y%m%d%H%M%S') if 'scandate' in i.metadata else None
    book.logger.info('verify_uploaded: scandate {}'.format(scandate))
    # scanner = i.metadata['scanner'] if 'scanner' in i.metadata else None
    # book.logger.info('verify_uploaded: scanner {}'.format(scanner))
    # this_scanner = config.get('identifier', 0)
    tasks_running, tasks_list = get_pending_catalog_tasks(identifier)
    book.logger.info(
        'verify_uploaded: pending book_tasks {}'.format(tasks_running))
    local_imgcount = int(ScanData(book.path).count_pages())
    remote_imgcount = int(
        i.metadata['imagecount']) if 'imagecount' in i.metadata else None
    book.logger.info('verify_uploaded: local pages: {} '
                     '| remote pages: {}'.format(local_imgcount,
                                                 remote_imgcount))

    # These are here so you can bypass one easily by setting it to True
    scandate_ok = False
    repub_state_ok = False
    tasks_running_ok = False
    # scanner_ok = False
    imgcount_ok = True

    # policies
    if not repub_state:
        repub_state_ok = True
    elif repub_state > 10:
        repub_state_ok = True

    threshold = config.get_numeric_or_none('defer_delete_by')
    if threshold and scandate:
        if not datetime.now() - timedelta(
                hours=threshold) <= scandate <= datetime.now():
            scandate_ok = True
    else:
        # If the user doesn't specify a value, delete immediately
        scandate_ok = True

    if tasks_running == 0:
        tasks_running_ok = True

    if remote_imgcount:
        if local_imgcount == remote_imgcount:
            imgcount_ok = True
    else:
        imgcount_ok = True

    # aggregate and return
    ret = scandate_ok \
        and repub_state_ok and tasks_running_ok \
        and imgcount_ok

    if book.force_delete:
        ret = True

    book.logger.info(
        'verify_uploaded: Do selectors allow for deletion?'
        ' scandate ok: {} | repub_state_ok {} '
        '| book_tasks ok: {} | imgcount_ok: {} | Force delete: {} -->>> '
        'VERDICT: {}'.format(scandate_ok, repub_state_ok, tasks_running_ok,
                             imgcount_ok, book.force_delete, ret))
    return ret
def _load_item(self):
    self.dispatch_progress('Loading IA item')
    self.item = get_ia_session().get_item(self.book.identifier)
def _get_ia_session(self):
    self.dispatch_progress('Getting IA session')
    self._ia_session = get_ia_session()