Example 1
    def __init__(self, metadata, **kwargs):
        #super(InternetArchive, self).__init__(**kwargs)
        self.SCRIBE_METADATA = metadata

        self.s = get_ia_session()
        self.SECURE = self.s.secure
        log.info("Initialized InternetArchive object")
Example 2
    def _update_set_book_repub_state(self):
        state = -2
        errors = []
        self.dispatch_progress('Setting republisher state to {}'.format(state))
        for attempt in range(scribe_globals.TASK_DEFAULT_MAX_RETRIES):
            try:
                self.dispatch_progress(
                    '[{}/{}] Setting republisher state to {}'.format(
                        attempt + 1, scribe_globals.TASK_DEFAULT_MAX_RETRIES,
                        state))
                ia_item = get_ia_session().get_item(self.identifier)
                resp = ia_item.modify_metadata({'repub_state': state})
                self.logger.info('Response from cluster: {} | '
                                 'Headers {}'.format(resp.text, resp.headers))
            except Exception as e:
                self.logger.error(
                    '[{}/{}] Transient error {} while setting repub_state '
                    'to {}.'.format(attempt + 1,
                                    scribe_globals.TASK_DEFAULT_MAX_RETRIES,
                                    e, state))
                errors.append(e)
                continue
            else:
                break
        else:
            self.logger.error(
                'Could not set repub_state to {} because {}'.format(
                    state, errors[-1]))
            payload = {
                'task_type': 'MARCMetadataViaDWWITask',
                'selector': self.isbn,
                'type': 'isbn',
                'errors': json.dumps([str(x) for x in errors]),
                'attempts': len(errors),
            }

            push_event('tts-task-exception', payload)
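
Note the for/else idiom above: the else clause runs only if the loop completes without a break, i.e. only after all TASK_DEFAULT_MAX_RETRIES attempts have failed, at which point the accumulated errors are reported via push_event.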
Example 3
def get_pending_catalog_tasks(identifier):
    ia_session = get_ia_session()
    item = ia_session.get_item(identifier)
    tasks = item.get_task_summary()
    tasks_list = [
        x for x in item.tasks if x.get('status') in ['running', 'queued']
    ] if item.tasks else []
    return tasks['running'] + tasks['queued'], tasks_list
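
A minimal usage sketch for the helper above (the identifier and the printed report are hypothetical):

count, tasks = get_pending_catalog_tasks('some-item-id')
if count:
    # Each entry in tasks still has status 'running' or 'queued'.
    print('{} catalog tasks pending: {}'.format(count, tasks))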
Example 4
    def _download_ia_item(self):
        self.dispatch_progress('Downloading ia item')
        session = get_ia_session()
        self._ia_item = session.get_item(self.identifier)
        error_message = self._validate_ia_item()
        if error_message:
            self._should_delete_book = False
            self.error = ValueError(error_message)
Example 5
    def _verify_preconditions(self):
        self.dispatch_progress('Verifying preconditions')
        if not self.book.has_identifier():
            self.ABLE_TO_CHANGE_REPUB_STATE = False
            return False
        item = get_ia_session().get_item(self.book.identifier)
        if not item.exists:
            self.ABLE_TO_CHANGE_REPUB_STATE = False
            return False
        if item.metadata.get('repub_state') != '{}'.format(
                self.ALLOWED_REPUB_STATE):
            self.ABLE_TO_CHANGE_REPUB_STATE = False
            return False
        self.ABLE_TO_CHANGE_REPUB_STATE = True
        return True
Example 6
def send_rejection(book):
    book.logger.info('Sending metrics to IA')
    metadata = book.metadata
    slip_metadata = book.get_slip_metadata()
    metadata.update(slip_metadata)
    book.logger.info('Sending this metadata upstream')
    session = get_ia_session()
    namespace = get_rejection_namespace(slip_metadata)
    result, payload = send_rejection_event(session, book, metadata, namespace)
    if result:
        book.logger.info('Metric successfully sent. Now deleting...')
        book.do_move_to_trash()
    else:
        raise payload
Example 7
def upload_book(book):
    Logger = book.logger
    Logger.debug('Starting upload of ' + book['identifier'])

    _check_preconditons(book)

    #book.do_book_upload_begin()

    _set_upload_lock_file(book, Logger)

    responses_dict = {}
    book_upload_total_start = time.time()
    try:
        scandata = ScanData(book['path'])

        zip_path = _check_preimage_is_valid(book)

        ia_session = get_ia_session()
        item = ia_session.get_item(book['identifier'])
        Logger.info('Got item {}'.format(item.identifier))

        if not book.force_upload:
            _check_remote_preconditons(item, Logger)

        encoded_md = _prepare_metadata(book, item, Logger)

        metasource_file_location, metasource_file_upload_name = _generate_metasource(
            book, Logger)

        responses = []
        book_upload_phase_start = time.time()

        needs_metadata_pushed = item.exists

        doing_foldouts = os.path.exists(
            os.path.join(book['path'], 'send_to_station'))

        (book_preimage_upload_start,
         book_preimage_upload_end,
         sizes_dict) = _upload_book_files(zip_path, book, encoded_md, item,
                                          responses, metasource_file_location,
                                          metasource_file_upload_name, Logger)

        if needs_metadata_pushed:
            _only_push_metadata(encoded_md, book, item, responses, Logger)

        book_upload_phase_end = time.time()

        _upload_logs(book=book, item=item, responses=responses)

        _verify_responses(responses, Logger)

        Logger.debug('OK! Finished uploads to {} | Took {}s'.format(
            book['identifier'],
            book_upload_phase_end - book_upload_phase_start))

        book.do_upload_book_end()

        _push_metrics(book, scandata, encoded_md, sizes_dict, doing_foldouts,
                      responses, responses_dict, book_upload_phase_start,
                      book_upload_phase_end, book_upload_total_start,
                      book_preimage_upload_start, book_preimage_upload_end)

        if config.is_true('show_book_notifications'):
            notifications_manager.add_notification(
                title='Uploaded',
                message="{} has been successfully uploaded.".format(
                    book['identifier']),
                book=book)

        Logger.debug('Finished upload for ' + book['identifier'])

        # Clock.schedule_once(partial(self.update_status_callback, book))
        time.sleep(10)  # Wait for book to be added to metadata api
    except requests.ConnectionError as e:

        book.do_upload_book_error()
        Logger.error(traceback.format_exc())
        payload = {
            'local_id': book['uuid'],
            'status': book['status'],
            'exception': str(e)
        }

        push_event('tts-book-failed-upload', payload, 'book',
                   book['identifier'])

        raise ScribeException('Upload Failed. '
                              'Please check network and S3 Keys')
    except Exception as e:

        book.do_upload_book_error()
        Logger.error(traceback.format_exc())

        payload = {
            'local_id': book['uuid'],
            'status': book['status'],
            'responses': responses_dict,
            'exception': str(e)
        }

        push_event('tts-book-upload-exception', payload, 'book',
                   book['identifier'])

        raise ScribeException('Upload Failed! - {}'.format(str(e)))
    finally:
        book.force_upload = False
        Logger.info("Removing upload lock file at {}".format(
            join(book['path'], "upload_lock")))
        os.remove(join(book['path'], "upload_lock"))
Example 8
def upload_book_foldouts(book):
    try:
        Logger = book.logger
        Logger.info('upload_book_foldouts: Uploading foldouts for book '
                    '{}'.format(book))

        ia_session = get_ia_session()
        book_item = ia_session.get_item(book['identifier'])

        _check_preconditions(book, book_item, Logger)

        book_folder = 'foldouts'

        cdic, tdic, rdic, rtdic = _create_scandata(book, book_folder, True, Logger)

        responses = []
        # Upload the pictures
        Logger.debug('upload_book_foldouts: Uploading pics')
        book.update_message('Foldouts upload | Images')
        if cdic:
            res = book_item.upload(cdic, retries=10, verify=True,
                                   retries_sleep=60, queue_derive=False)
            responses.append(res)

        if tdic:
            res = book_item.upload(tdic, retries=10, verify=True,
                                   retries_sleep=60, queue_derive=False)
            responses.append(res)

        try:
            if rdic:
                res = book_item.upload(rdic, retries=10, verify=True,
                                       retries_sleep=60, queue_derive=False)
                responses.append(res)

            if rtdic:
                res = book_item.upload(rtdic, retries=10, verify=True,
                                       retries_sleep=60, queue_derive=False)
                responses.append(res)
        except requests.exceptions.ConnectionError as e:
            Logger.error(
                'upload_book_foldouts: Connection exception {} '
                'has occurred at rdic upload time; aborting!'.format(str(e)))
            raise e
        except Exception as e:
            Logger.error('upload_book_foldouts: Error {} has occurred '
                         'at rdic upload time'.format(e))
            raise e

        Logger.debug('upload_book_foldouts: Done. Uploading scandata...')
        # Upload the scandata

        target_scandata = 'scandata.json'
        book.update_message('Foldouts upload | Scandata')
        scandata = join(book['path'], 'scandata_rerepublished.json')
        upload_res = book_item.upload({target_scandata: scandata},
                                      retries=10,
                                      retries_sleep=60,
                                      queue_derive=False,
                                      verify=True)

        if os.path.exists(os.path.join(book['path'], 'scanning.log')):
            book.update_message('Foldouts upload | Log')
            book.logger.debug(
                'Uploading Scanning log file'
            )
            upload_name_mapping = {
                'logs/' + book['identifier'] +
                '_scanning_{:%Y-%m-%d%H:%M:%S}.log'.format(datetime.now()):
                    join(book['path'], 'scanning.log')}
            response = book_item.upload(upload_name_mapping, retries=10,
                                        retries_sleep=60, queue_derive=False,
                                        verbose=True, verify=True)
            responses.append(response)
            url_to_status_code = \
                {r.request.url: r.status_code for r in response}
            book.logger.debug('Response from upload: {} | {}'
                              .format(response, url_to_status_code))

        responses.append(upload_res)
        # corrections_uploaded

        # flatten responses list:
        flat_responses = [y for x in responses for y in x]
        for response in flat_responses:
            Logger.info('{} returned {}'.format(response.url,
                                                response.status_code))
            if response.status_code != 200:
                raise Exception('upload_book_foldouts: Response code {} {} - '
                                '{} from cluster. URL was: {} | content: {}. '
                                'This is an error. Upload will be attempted '
                                'again.'.format(response.status_code,
                                                response.reason,
                                                response.text,
                                                response.url,
                                                response.content))

        Logger.debug('Done. Changing repub state...')

        _change_repub_state(book_item, 43)

        _remove_book_from_btserver_item(book, Logger)

        book.do_upload_foldouts_done()

        payload = {
            'repub_state': 43,
            'responses': flat_responses,
        }
        push_event('tts-book-corrections-sent', payload,
                   'book', book['identifier'])
        Logger.debug('All done.')

        return

    except requests.ConnectionError as e:
        raise ScribeException('Upload Failed. Please check network and '
                              'S3 Keys (Error was: {})'.format(e))
    except Exception as e:
        book.do_upload_foldouts_fail()
        book.raise_exception(e)
Example 9
    def _load_item(self):
        self.dispatch_progress('Asking Archive.org for {}'.format(
            self.identifier))
        self.ia_item = get_ia_session().get_item(self.identifier)
Example 10
def get_repub_state(book):
    session = get_ia_session()
    item = session.get_item(book.identifier)
    repub_state = int(item.metadata['repub_state'])
    return repub_state
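
get_repub_state raises a KeyError if the item metadata has no 'repub_state' field. A tolerant variant (a sketch, not part of the original source; the function name is hypothetical):

def get_repub_state_or_none(book):
    # Return repub_state as an int, or None when the field is absent.
    session = get_ia_session()
    item = session.get_item(book.identifier)
    raw_state = item.metadata.get('repub_state')
    return int(raw_state) if raw_state is not None else None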
Example 11
def item_ready_for_upload(book):
    '''Book items might have already been preloaded with metadata in the
    IA scan process. However, prevent uploading to ia items which already
    have images uploaded.

    Called in worker thread.
    '''

    try:
        session = get_ia_session()
        item = session.get_item(book.identifier)

        if not item.exists:
            if book:
                preloaded_path = os.path.join(book.path, 'preloaded')
                if os.path.exists(preloaded_path):
                    # This item was created in offline mode, but the
                    # identifier doesn't exist
                    book.logger.error(
                        'Item {0} is tagged as preloaded, but '
                        'the identifier does not exist. Aborting '
                        'upload and reverting to scribing '
                        'status.'.format(book.identifier))
                    return False
                else:
                    book.logger.info('Item does not exist and user wants to '
                                     'upload to item {0}. Ok\'ing that'.format(
                                         book.identifier))
                    # no existing item, so safe to use this identifier
                    return True
        allowed_formats = {
            'Metadata', 'MARC', 'MARC Source', 'MARC Binary', 'Dublin Core',
            'Archive BitTorrent', 'Web ARChive GZ', 'Web ARChive', 'Log',
            'OCLC xISBN JSON', 'Internet Archive ARC',
            'Internet Archive ARC GZ', 'CDX Index', 'Item CDX Index',
            'Item CDX Meta-Index', 'WARC CDX Index', 'Metadata Log'
        }

        ALLOWED_ITEM_FILE_NAMES = [
            '{}_{}'.format(book.identifier, x)
            for x in ALLOWED_VARIABLE_FILE_NAMES
        ]

        for item_file_metadata in item.files:
            if item_file_metadata['format'] not in allowed_formats:
                # Ignore new style in-item thumb files
                if item_file_metadata['name'] in ALLOWED_FILE_NAMES:
                    book.logger.info(
                        'File {} ({}) is present in '
                        'remote item and allowed: continuing...'.format(
                            item_file_metadata['name'],
                            item_file_metadata['format']))
                    continue
                elif item_file_metadata['name'] in ALLOWED_ITEM_FILE_NAMES:
                    continue
                # files have already been uploaded to this item
                book.logger.error(
                    'File {} in item {} is blocking upload.'.format(
                        item_file_metadata, item.identifier))
                return False

    except Exception:
        book.logger.error(traceback.format_exc())
        raise ScribeException('Could not check status of IA item {}'.format(
            book.identifier))

    return True
Example 12
def verify_uploaded(book):

    ia_session = get_ia_session()

    book.logger.info(
        'verify_uploaded: Verifying {} was uploaded to the cluster.'.format(
            book))

    # we do have identifier in the book dictionary, but we only trust
    # what's on the drive for this one
    identifier = book.identifier
    if not identifier:
        book.logger.info(
            'verify_uploaded: No identifier.txt. '
            'Assuming empty book and deleting.')
        return True

    book.logger.info(
        'verify_uploaded: Read {} from identifier.txt.'.format(identifier))

    # gather data

    i = ia_session.get_item(identifier)

    repub_state = int(
        i.metadata['repub_state']) if 'repub_state' in i.metadata else None
    book.logger.info('verify_uploaded: repub_state {}'.format(repub_state))

    scandate = datetime.strptime(
        i.metadata['scandate'],
        '%Y%m%d%H%M%S') if 'scandate' in i.metadata else None
    book.logger.info('verify_uploaded: scandate {}'.format(scandate))

    #scanner = i.metadata['scanner'] if 'scanner' in i.metadata else None
    #book.logger.info('verify_uploaded: scanner {}'.format(scanner))
    #this_scanner = config.get('identifier', 0)

    tasks_running, tasks_list = get_pending_catalog_tasks(identifier)
    book.logger.info(
        'verify_uploaded: pending book_tasks {}'.format(tasks_running))

    local_imgcount = int(ScanData(book.path).count_pages())
    remote_imgcount = int(
        i.metadata['imagecount']) if 'imagecount' in i.metadata else None
    book.logger.info('verify_uploaded: local pages: {} '
                     '| remote pages: {}'.format(local_imgcount,
                                                 remote_imgcount))

    # These are here so you can bypass one easily by setting it to True
    scandate_ok = False
    repub_state_ok = False
    tasks_running_ok = False
    #scanner_ok = False
    imgcount_ok = True

    # policies
    if not repub_state:
        repub_state_ok = True
    elif repub_state > 10:
        repub_state_ok = True

    threshold = config.get_numeric_or_none('defer_delete_by')
    if threshold and scandate:
        if not datetime.now() - timedelta(
                hours=threshold) <= scandate <= datetime.now():
            scandate_ok = True
    else:
        # If the user doesn't specify a value, delete immediately
        scandate_ok = True

    if tasks_running == 0:
        tasks_running_ok = True

    if remote_imgcount:
        if local_imgcount == remote_imgcount:
            imgcount_ok = True
    else:
        imgcount_ok = True

    # aggregate and return
    ret = (scandate_ok and repub_state_ok and tasks_running_ok
           and imgcount_ok)

    if book.force_delete:
        ret = True

    book.logger.info(
        'verify_uploaded: Do selectors allow for deletion? '
        'scandate ok: {} | repub_state ok: {} | book_tasks ok: {} | '
        'imgcount ok: {} | force delete: {} ->>> VERDICT: {}'.format(
            scandate_ok, repub_state_ok, tasks_running_ok,
            imgcount_ok, book.force_delete, ret))

    return ret
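
As a concrete reading of the scandate policy: with defer_delete_by set to 24, a book scanned two hours ago lies inside the 24-hour window, so scandate_ok stays False and deletion is deferred; a book scanned three days ago lies outside it, so scandate_ok becomes True.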
Example 13
    def _load_item(self):
        self.dispatch_progress('Loading IA item')
        self.item = get_ia_session().get_item(self.book.identifier)
Example 14
    def _get_ia_session(self):
        self.dispatch_progress('Getting IA session')
        self._ia_session = get_ia_session()
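
The shared get_ia_session() helper is never shown in these examples. A plausible minimal sketch, assuming it wraps the internetarchive package's get_session() and caches one session per process (the config values below are placeholders, not the project's real credentials handling):

import internetarchive

_session = None

def get_ia_session():
    # Build and cache a single ArchiveSession for the whole process.
    # The S3 keys here are placeholders for illustration only.
    global _session
    if _session is None:
        _session = internetarchive.get_session(config={
            's3': {'access': 'YOUR_ACCESS_KEY', 'secret': 'YOUR_SECRET_KEY'},
        })
    return _session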