Exemple #1
0
    def unsave_thumbnail(self, thumbnail_url: str, record_identifier: str):
        """Removes thumbnail from the local filesystem and from S3.

        Args:
            thumbnail_url: (possibly percent-encoded) URL of the thumbnail;
                its path component doubles as the S3 object key.
            record_identifier: identifier of the record, used for logging.

        Raises:
            IndexerError: if the local file or the S3 object cannot be
                removed.
        """

        try:
            # The S3 key is the decoded URL path with the leading '/' stripped.
            thumbnail_s3_key = os.path.relpath(
                urllib.parse.urlparse(
                    urllib.parse.unquote(thumbnail_url)).path, '/')
            filepath = os.path.join(
                os.path.abspath(os.environ.get('THUMBNAILS_DIRECTORY')),
                thumbnail_s3_key)
            os.remove(filepath)
            logging.debug('%s thumbnail removed from local filesystem at %s',
                          record_identifier, filepath)

            # TODO: clean up empty parent directories
            self.s3.delete_object(Bucket=os.environ.get('AWS_S3_BUCKET_NAME'),
                                  Key=thumbnail_s3_key)
            logging.debug('%s thumbnail removed from S3', record_identifier)
        except BotoCoreError as e:
            # BUG FIX: BotoCoreError has no 'msg' attribute, so the old
            # 'e.msg' raised an AttributeError inside this handler; format
            # the exception itself instead.
            raise IndexerError(
                'Failed to remove thumbnail from S3: {}'.format(e)) from e
        except Exception as e:
            raise IndexerError(
                'Failed to remove thumbnail from local filesystem: {}'.format(
                    e)) from e
Exemple #2
0
    def _connect_external_services(self):
        """Initializes the interfaces for all third-party services NOT instantiated by this module.

        Builds the Solr base URL from environment variables, verifies Solr is
        reachable via its admin ping handler, then creates the Solr client
        and an S3 client from AWS credentials in the environment.

        Raises:
            IndexerError: if Solr is unreachable or the S3 session cannot be
                initialized.
        """

        try:
            solr_base_url = 'http://{}:{}/solr/{}'.format(
                os.environ.get('SOLR_HOST'), os.environ.get('SOLR_PORT'),
                os.environ.get('SOLR_CORE_NAME'))

            # Make sure we can connect to Solr.
            def solr_ping(base_url):
                """Raises an error if we can't connect to Solr."""
                # BUG FIX: use the 'base_url' parameter instead of silently
                # closing over the outer 'solr_base_url' variable.
                o = urllib.parse.urlsplit(base_url)
                ping_url = urllib.parse.urlunsplit(
                    o[:2] + (os.path.join(o.path, 'admin/ping'), ) + o[3:])
                requests.get(ping_url).raise_for_status()

            solr_ping(solr_base_url)

            self.solr = Solr(solr_base_url, always_commit=True)
            self.s3 = boto3.Session(
                aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
                aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
                region_name=os.environ.get('AWS_DEFAULT_REGION')).client('s3')
        except requests.exceptions.RequestException as e:
            raise IndexerError('Connection failed: {}'.format(e)) from e
        except BotoCoreError as e:
            raise IndexerError('Failed to initialize S3 session: {}'.format(
                repr(e))) from e
Exemple #3
0
    def unsave_thumbnail(self, thumbnail_url: str, record_identifier: str,
                         institution_key: str, collection_keys: List[str]):
        """Removes thumbnail from the local filesystem and from S3.

        Args:
            thumbnail_url: (possibly percent-encoded) URL of the thumbnail;
                its path component doubles as the S3 object key.
            record_identifier: identifier of the record, used for logging.
            institution_key: identifier of the institution (not used by the
                current implementation).
            collection_keys: identifiers of the record's collections (not
                used by the current implementation).

        Raises:
            IndexerError: if the local file or the S3 object cannot be
                removed.
        """

        try:
            # The S3 key is the decoded URL path with the leading '/' stripped.
            thumbnail_s3_key = os.path.relpath(
                urllib.parse.urlparse(
                    urllib.parse.unquote(thumbnail_url)).path, '/')
            filepath = os.path.join(
                os.path.abspath(
                    os.path.expanduser(self.config['s3']['sync']['source'])),
                thumbnail_s3_key)
            os.remove(filepath)
            logging.debug('%s thumbnail removed from local filesystem at %s',
                          record_identifier, filepath)

            # TODO: clean up empty parent directories
            self.s3.delete_object(
                Bucket=self.config['s3']['sync']['destination']['s3_uri'],
                Key=thumbnail_s3_key)
            logging.debug('%s thumbnail removed from S3', record_identifier)
        except BotoCoreError as e:
            # BUG FIX: BotoCoreError has no 'msg' attribute, so the old
            # 'e.msg' raised an AttributeError inside this handler; format
            # the exception itself instead.
            raise IndexerError(
                'Failed to remove thumbnail from S3: {}'.format(e)) from e
        except Exception as e:
            raise IndexerError(
                'Failed to remove thumbnail from local filesystem: {}'.format(
                    e)) from e
Exemple #4
0
    def update_record(self, path: str):
        """Updates a metadata record in PRL.

        Responds to IndexerEventHandler.on_modified filesystem event.

        Args:
            path: filesystem path of the metadata record.

        Raises:
            IndexerError: if the Solr or LevelDB update fails.
        """
        if not self.args['dry_run']:

            record_metadata = self.get_key_record_metadata(path)
            record_identifier = record_metadata[0]
            record_sets_serialized_encoded = self.record_sets.get(
                record_identifier.encode())

            # Generate a Solr document from the metadata record.
            with open(path, 'r', encoding='utf-8') as record_file:
                prl_solr_document = self.get_solr_document(record_file)

            # If there is a thumbnail, save it to the system.
            if prl_solr_document.original_thumbnail_metadata():
                self.save_thumbnail(prl_solr_document)

            record_identifier = prl_solr_document.id

            # Determine whether or not this is a create or an update.
            if record_sets_serialized_encoded is None:
                action = 'create'
            else:
                action = 'update'
                # If we've processed this record in the past, make sure we don't completely overwrite the collectionKey or collectionName fields.
                # We save these locally in LevelDB.
                record_sets = json.loads(
                    record_sets_serialized_encoded.decode())
                prl_solr_document.complete_collection_list(
                    record_sets['collectionKey'],
                    record_sets['collectionName'])

            pysolr_doc = prl_solr_document.get_pysolr_doc()
            collection_key = pysolr_doc['collectionKey']
            collection_name = pysolr_doc['collectionName']

            try:
                self.solr.add([pysolr_doc], overwrite=True)
                logging.debug('%s %sd in Solr', record_identifier, action)

                self.record_sets.put(
                    record_identifier.encode(),
                    json.dumps({
                        'collectionKey': collection_key,
                        'collectionName': collection_name
                    }).encode())
                logging.info('%s %sd in PRL', record_identifier, action)
            except plyvel.Error as e:
                # Keep Solr and LevelDB consistent: undo the Solr add.
                self.solr.delete(id=record_identifier)
                raise IndexerError('Failed to PUT on LevelDB: {}'.format(e))
            except Exception as e:
                raise IndexerError(
                    'Failed to update Solr document: {}'.format(e))
        else:
            # BUG FIX: 'record_identifier' was referenced here but is only
            # assigned on the non-dry-run path, causing a NameError; log the
            # record's path instead.
            logging.info('DRY-RUN: %s updated in PRL', path)
Exemple #5
0
    def download_thumbnail(self, prl_solr_document: PRLSolrDocument):
        """Puts the thumbnail file in its place on the file system.

        Retries the HTTP GET up to three times on timeouts.

        Returns its path, or None if no thumbnail could be fetched.

        Raises:
            IndexerError: if the thumbnail cannot be written to the local
                filesystem.
        """

        # TODO: need better exception handling here
        thumbnail_s3_key = prl_solr_document.get_thumbnail_s3_key()
        try:
            filepath = os.path.join(
                os.path.abspath(
                    os.path.expanduser(self.config['s3']['sync']['source'])),
                thumbnail_s3_key)
            os.makedirs(os.path.dirname(filepath), exist_ok=True)

            original_thumbnail_url = prl_solr_document.original_thumbnail_metadata(
            )['url']
            n_tries = 3
            for try_i in range(1, n_tries + 1):
                try:
                    response = requests.get(original_thumbnail_url,
                                            timeout=30,
                                            stream=True)
                    # Fail on 4xx or 5xx
                    response.raise_for_status()
                    # Make sure the Content-Type is what we expect. Some servers discriminate against robots.
                    # BUG FIX: default the header to '' so a response without
                    # a Content-Type doesn't crash re.match with a TypeError.
                    if re.match('image/.+',
                                response.headers.get('Content-Type', '')):
                        with open(filepath, 'wb') as image_file:
                            for chunk in response.iter_content(
                                    chunk_size=1024):
                                image_file.write(chunk)
                        logging.debug(
                            '%s thumbnail put on local filesystem at %s',
                            thumbnail_s3_key, filepath)
                        return filepath
                    else:
                        logging.debug('Robots cannot access %s',
                                      original_thumbnail_url)
                        return None
                except requests.Timeout as e:
                    if try_i < n_tries:
                        msg = 'Thumbnail download timed out, retrying...'
                        logging.info(msg)
                        # Continue loop
                    else:
                        # No more tries left, so fail
                        msg = 'Failed to download thumbnail after {} tries: {}'.format(
                            n_tries, str(e))
                        logging.debug(msg)
                        return None
                except (requests.RequestException, IOError) as e:
                    msg = 'Failed to download thumbnail: {}'.format(e)
                    logging.debug(msg)
                    return None
        except Exception as e:
            raise IndexerError(
                'Failed to put thumbnail on local filesystem: {}'.format(e))
Exemple #6
0
    def _disconnect_internal_services(self):
        """Closes connections with all third-party services instantiated by this module."""

        try:
            # Close both LevelDB handles owned by this module.
            for leveldb_handle in (self.record_identifiers,
                                   self.harvester_settings):
                leveldb_handle.close()
        except plyvel.Error as e:
            raise IndexerError(
                'Failed to close the connection to LevelDB: {}'.format(e))
Exemple #7
0
    def update_record(self, path: str):
        """Updates a metadata record in PRL.

        Responds to IndexerEventHandler.on_modified filesystem event.

        Args:
            path: filesystem path of the metadata record.

        Raises:
            IndexerError: if the record cannot be indexed (logged instead of
                raised in dry-run mode).
        """

        # BUG FIX: pre-bind the identifier so the dry-run error handler below
        # cannot hit a NameError when parsing fails before it is assigned.
        record_identifier = path
        try:
            # Generate a Solr document from the metadata record.
            with open(path, 'r') as record_file:
                prl_solr_document = self.get_solr_document(record_file)
            pysolr_doc = prl_solr_document.get_pysolr_doc()
            record_identifier = prl_solr_document.get_record_identifier()

            if not self.args['dry_run']:
                # If there is a thumbnail, save it; drop the URL if saving
                # failed so we don't index a dead link.
                if prl_solr_document.original_thumbnail_metadata():
                    thumbnail_saved = self.save_thumbnail(prl_solr_document)
                    if not thumbnail_saved:
                        prl_solr_document.discard_incorrect_thumbnail_url()
                try:
                    self.solr.add([pysolr_doc])
                    logging.debug('%s updated in Solr', record_identifier)
                    self.record_identifiers.put(path.encode(),
                                                record_identifier.encode())
                except plyvel.Error as e:
                    # Keep Solr and LevelDB consistent: undo the Solr add.
                    self.solr.delete(id=record_identifier)
                    raise IndexerError(
                        'Failed to PUT on LevelDB: {}'.format(e))
                except Exception as e:
                    raise IndexerError(
                        'Failed to update Solr document: {}'.format(e))

                logging.info('%s updated in PRL', record_identifier)
            else:
                logging.info('DRY-RUN: %s updated in PRL', record_identifier)
        except IndexerError as e:
            if self.args['dry_run']:
                logging.error('DRY-RUN: %s would not be updated in PRL: %s',
                              record_identifier, e)
            else:
                raise
Exemple #8
0
    def _connect_internal_services(self):
        """Initializes the interfaces for all third-party services instantiated by this module."""

        try:
            # Resolve both configured LevelDB paths, then open the databases,
            # creating them on first run.
            record_identifiers_path = os.path.expanduser(
                self.config['leveldb']['record_identifiers']['path'])
            harvester_settings_path = os.path.expanduser(
                self.config['leveldb']['harvester_settings']['path'])
            self.record_identifiers = plyvel.DB(record_identifiers_path,
                                                create_if_missing=True)
            self.harvester_settings = plyvel.DB(harvester_settings_path,
                                                create_if_missing=True)
            self.set_harvester_settings()
        except plyvel.IOError as e:
            raise IndexerError(
                'Failed to instantiate LevelDB instance: {}'.format(repr(e)))
Exemple #9
0
    def _connect_internal_services(self):
        """Initializes the interfaces for all third-party services instantiated by this module."""

        try:
            # Resolve both LevelDB directories from the environment, then
            # open the databases, creating them on first run.
            harvester_settings_dir = os.path.expanduser(
                os.environ.get('LEVELDB_HARVESTER_SETTINGS_DIRECTORY'))
            record_sets_dir = os.path.expanduser(
                os.environ.get('LEVELDB_RECORD_SETS_DIRECTORY'))
            self.harvester_settings = plyvel.DB(harvester_settings_dir,
                                                create_if_missing=True)
            self.record_sets = plyvel.DB(record_sets_dir,
                                         create_if_missing=True)
            self.set_harvester_settings()
        except plyvel.IOError as e:
            raise IndexerError(
                'Failed to instantiate LevelDB instance: {}'.format(repr(e)))
Exemple #10
0
    def _connect_external_services(self):
        """Initializes the interfaces for all third-party services NOT instantiated by this module.

        Verifies Solr is reachable via its admin ping handler, then creates
        the Solr client and an S3 client from the configured AWS profile.

        Raises:
            IndexerError: if Solr is unreachable or the AWS profile cannot be
                found.
        """

        try:
            solr_base_url = self.config['solr']['base_url']

            # Make sure we can connect to Solr.
            def solr_ping(base_url):
                """Raises an error if we can't connect to Solr."""
                # BUG FIX: use the 'base_url' parameter instead of silently
                # closing over the outer 'solr_base_url' variable.
                o = urllib.parse.urlsplit(base_url)
                ping_url = urllib.parse.urlunsplit(
                    o[:2] + (os.path.join(o.path, 'admin/ping'), ) + o[3:])
                requests.get(ping_url).raise_for_status()

            solr_ping(solr_base_url)

            self.solr = Solr(solr_base_url, always_commit=True)
            self.s3 = boto3.Session(profile_name=self.config['s3']['configure']
                                    ['profile_name']).client('s3')
        except requests.exceptions.RequestException as e:
            raise IndexerError('Connection failed: {}'.format(e)) from e
        except ProfileNotFound as e:
            raise IndexerError('Failed to initialize S3 session: {}'.format(
                repr(e))) from e
Exemple #11
0
    def upload_thumbnail(self, prl_solr_document: PRLSolrDocument,
                         filepath: str):
        """Puts the thumbnail on S3.

        Args:
            prl_solr_document: document whose thumbnail is being uploaded.
            filepath: local path of the downloaded thumbnail file.

        Raises:
            IndexerError: if the S3 PUT fails.
        """

        try:
            # BUG FIX: open the file in a 'with' block so the handle is
            # always closed (it previously leaked on every call).
            with open(filepath, 'rb') as thumbnail_file:
                self.s3.put_object(
                    Bucket=self.config['s3']['sync']['destination']['s3_uri'],
                    Key=prl_solr_document.get_thumbnail_s3_key(),
                    Body=thumbnail_file,
                    ContentType=prl_solr_document.original_thumbnail_metadata()
                    ['content-type'])
            logging.debug('%s thumbnail put on S3',
                          prl_solr_document.get_record_identifier())
        except BotoCoreError as e:
            # BUG FIX: BotoCoreError has no 'msg' attribute; format the
            # exception itself to avoid an AttributeError in this handler.
            raise IndexerError(
                'Failed to put thumbnail on S3: {}'.format(e)) from e
Exemple #12
0
    def upload_thumbnail(self, prl_solr_document: PRLSolrDocument,
                         filepath: str):
        """Puts the thumbnail on S3.

        Args:
            prl_solr_document: document whose thumbnail is being uploaded.
            filepath: local path of the downloaded thumbnail file.

        Raises:
            IndexerError: if the S3 PUT fails.
        """

        # Determine a URL for the thumbnail now that we've downloaded it and know the image format
        prl_solr_document.add_thumbnail_url()

        try:
            # BUG FIX: open the file in a 'with' block so the handle is
            # always closed (it previously leaked on every call).
            with open(filepath, 'rb') as thumbnail_file:
                self.s3.put_object(
                    Bucket=os.environ.get('AWS_S3_BUCKET_NAME'),
                    Key=prl_solr_document.get_thumbnail_s3_key(),
                    Body=thumbnail_file,
                    ContentType=prl_solr_document.original_thumbnail_metadata()
                    ['content-type'])
            logging.debug('%s thumbnail put on S3',
                          prl_solr_document.get_record_identifier())
        except BotoCoreError as e:
            # BUG FIX: BotoCoreError has no 'msg' attribute; format the
            # exception itself to avoid an AttributeError in this handler.
            raise IndexerError(
                'Failed to put thumbnail on S3: {}'.format(e)) from e
Exemple #13
0
    def remove_record(self, path: str):
        """Removes a metadata record from PRL.

        Responds to IndexerEventHandler.on_deleted filesystem event.

        Args:
            path: filesystem path of the (deleted) metadata record.

        Raises:
            IndexerError: if the record cannot be found or removed.
        """

        if not self.args['dry_run']:
            try:
                record_identifier_encoded = self.record_identifiers.get(
                    path.encode())
                if record_identifier_encoded is None:
                    # BUG FIX: .get() returns None for unknown paths; the old
                    # code crashed with an AttributeError on .decode() (and
                    # then a NameError in the generic handler below, which
                    # formats the still-unbound 'record_identifier').
                    raise IndexerError(
                        'No record identifier found in LevelDB for {}'.format(
                            path))
                record_identifier = record_identifier_encoded.decode()
                docs = self.solr.search('id:"{0}"'.format(record_identifier))
                if len(docs) == 0:
                    raise IndexerError('Document not found in Solr: {}'.format(
                        record_identifier))
                elif len(docs) > 1:
                    # This should never happen. If it does, probably an issue with the schema.
                    raise IndexerError(
                        'Solr doesn\'t have unique IDs: {} records found with identifier {}'
                        .format(len(docs), record_identifier))
            except plyvel.Error as e:
                raise IndexerError('Failed to GET on LevelDB: {}'.format(e))
            except IndexerError:
                raise
            except Exception as e:
                raise IndexerError(
                    'Failed to search for Solr document {}: {}'.format(
                        record_identifier, e))

            try:
                self.solr.delete(id=record_identifier)
                logging.debug('%s removed from Solr', record_identifier)
                self.record_identifiers.delete(path.encode())
                # Remove any thumbnails referenced by the Solr document(s).
                for doc in docs:
                    if 'thumbnail_url' in doc:
                        self.unsave_thumbnail(doc['thumbnail_url'],
                                              record_identifier,
                                              doc['institutionKey'],
                                              doc['collectionKey'])
                logging.info('%s removed from PRL', record_identifier)
            except plyvel.Error as e:
                raise IndexerError('Failed to DELETE on LevelDB: {}'.format(e))
            except IndexerError:
                raise
            except Exception as e:
                raise IndexerError(
                    'Failed to remove Solr document: {}'.format(e))
        else:
            logging.info('DRY-RUN: Removed %s', path)
Exemple #14
0
    def get_oai_pmh_metadata(self, base_url: str) -> Dict[str, str]:
        """Returns a dictionary containing top-level metadata and set metadata of an OAI-PMH repository."""

        logging.debug(
            'Retrieving repository and set metadata from OAI-PMH repository %s',
            base_url)
        try:
            metadata = {}

            # All repositories should have this metadata.
            identify_response = Sickle(base_url, timeout=60).Identify()
            for attribute_name, metadata_key in (
                ('repositoryIdentifier', 'repository_identifier'),
                ('repositoryName', 'repository_name')):
                if hasattr(identify_response, attribute_name):
                    metadata[metadata_key] = getattr(identify_response,
                                                     attribute_name)

            # Not all repositories will support sets.
            try:
                list_sets_response = Sickle(base_url, timeout=60).ListSets()
                metadata['sets'] = {
                    s.setSpec: s.setName
                    for s in list(list_sets_response)
                }
            except sickle.oaiexceptions.NoSetHierarchy as e:
                logging.debug(
                    'Failed to list sets from OAI-PMH repository %s: %s',
                    base_url, e)

            return metadata

        except requests.RequestException as e:
            raise IndexerError(
                'Failed to get repository metadata from OAI-PMH repository {}: {}'
                .format(base_url, e))
Exemple #15
0
    def read_harvester_settings_file(self,
                                     path: str) -> Dict[str, Dict[str, str]]:
        """Returns a dictionary representing the harvester settings.

        First, tries reading the settings as if the source file is UTF-8 encoded JSON of the following form (used for testing):

        {
            "harvester_settings_key_1": {
                "repository_name": "repository_name_1",
                "base_url": "http://example.edu/oai2",
                "set_spec": "set_spec_1",
                "split_by_set": False
            },
            ...
        }

        If that fails, tries reading the settings as if the source file is a serialized java.util.Hashtable instance from jOAI (used for production).
        """

        try:
            # See if it's in JSON already.
            with open(path, 'r') as harvester_settings_file:
                # Make sure we transform the key before storing.
                return {
                    self.get_harvester_settings_key(key): metadata
                    for key, metadata in json.load(
                        harvester_settings_file).items()
                }
        except JSONDecodeError as e:
            # Invalid JSON.
            raise IndexerError(
                'Cannot load scheduled harvests settings: {}'.format(e))
        except FileNotFoundError as e:
            # This file won't exist when no harvests have been scheduled, so it's probably fine.
            logging.debug(
                'Scheduled harvests settings file does not exist: {}'.format(
                    path))
            return {}
        except UnicodeDecodeError as e:
            # The file is binary, so fall through to the jOAI parsing path.
            logging.debug('Config file is not JSON: {}'.format(e))

            # NOTE(review): exceptions raised inside this handler (e.g. a
            # javaobj parse failure) are NOT caught by the 'except Exception'
            # below -- they propagate to the caller unwrapped.
            # Open the file in binary mode and try to parse it with javaobj.
            with open(path, 'rb') as harvester_settings_file:
                pobj = javaobj.loads(harvester_settings_file.read())

            # Select jOAI ScheduledHarvest objects among the parsed
            # annotations by matching their Java class name.
            is_scheduled_harvest = lambda h: JOAI_SCHEDULED_HARVEST_CLASSNAME in str(
                h)

            # Re-key each scheduled harvest by its harvest directory and keep
            # only the fields the indexer needs.
            return {
                self.get_harvester_settings_key(pobj_harvest.harvestDir.path):
                {
                    'repository_name': pobj_harvest.repositoryName,
                    'base_url': pobj_harvest.baseURL,
                    'set_spec': pobj_harvest.setSpec,
                    'split_by_set': pobj_harvest.splitBySet
                }
                for pobj_harvest in list(
                    filter(is_scheduled_harvest, pobj.annotations))
            }
        except Exception as e:
            # Something else went wrong.
            raise IndexerError(
                'Cannot load scheduled harvests settings: {}'.format(e))
Exemple #16
0
    def remove_record(self, path: str):
        """Removes a metadata record from PRL.

        Responds to IndexerEventHandler.on_deleted filesystem event.

        If the record belongs to exactly one collection, it is removed
        entirely (thumbnail, Solr document, LevelDB entry); otherwise only
        its membership in the deleted file's collection is revoked.

        Args:
            path: filesystem path of the (deleted) metadata record.

        Raises:
            IndexerError: if any Solr or LevelDB operation fails.
        """
        if not self.args['dry_run']:
            try:
                record_metadata = self.get_key_record_metadata(path)
                record_identifier = record_metadata[0]
                record_sets_serialized_encoded = self.record_sets.get(
                    record_identifier.encode())
                if record_sets_serialized_encoded is None:
                    # BUG FIX: .get() returns None for unknown records; the
                    # old code assumed presence and crashed with an
                    # AttributeError on .decode().
                    raise IndexerError(
                        'No record sets found in LevelDB for {}'.format(
                            record_identifier))
                # We're certain that our serialized JSON is valid.
                record_sets = json.loads(
                    record_sets_serialized_encoded.decode())
            except plyvel.Error as e:
                raise IndexerError('Failed to GET on LevelDB: {}'.format(e))

            # Either remove the record from the system, or update it.
            if len(record_sets['collectionKey']) == 1:
                # Remove the thumbnail if there is one.
                try:
                    pysolr_doc = self.solr.search(
                        'id:"{0}"'.format(record_identifier)).docs[0]
                except Exception as e:
                    raise IndexerError('Failed to GET {} from Solr: {}'.format(
                        record_identifier, e))
                if 'thumbnail_url' in pysolr_doc:
                    self.unsave_thumbnail(pysolr_doc['thumbnail_url'],
                                          record_identifier)

                # Remove the document from Solr.
                try:
                    self.solr.delete(id=record_identifier)
                except Exception as e:
                    raise IndexerError(
                        'Failed to DELETE {} from Solr: {}'.format(
                            record_identifier, e))
                logging.debug('%s removed from Solr', record_identifier)

                try:
                    self.record_sets.delete(record_identifier.encode())
                except plyvel.Error as e:
                    raise IndexerError(
                        'Failed to DELETE on LevelDB: {}'.format(e))

                logging.info('%s removed from PRL', record_identifier)
            else:
                # Update the list of collections that the record belongs to.
                # This is the case when a record belongs to more than one OAI-PMH set.
                collection_key = list(
                    filter(lambda x: x != record_metadata[3],
                           record_sets['collectionKey']))
                collection_name = list(
                    filter(lambda x: x != record_metadata[4],
                           record_sets['collectionName']))

                pysolr_doc = {
                    'id': record_identifier,
                    'collectionKey': collection_key,
                    'collectionName': collection_name
                }

                try:
                    self.solr.add([pysolr_doc],
                                  fieldUpdates={
                                      'collectionKey': 'set',
                                      'collectionName': 'set'
                                  },
                                  overwrite=True)
                except Exception as e:
                    raise IndexerError('Failed to POST {} on Solr: {}'.format(
                        record_identifier, e))
                logging.debug(
                    '%s updated in Solr (removed from collection %s)',
                    record_identifier, record_metadata[3])

                try:
                    self.record_sets.put(
                        record_identifier.encode(),
                        json.dumps({
                            'collectionKey': collection_key,
                            'collectionName': collection_name
                        }).encode())
                except plyvel.Error as e:
                    raise IndexerError(
                        'Failed to PUT on LevelDB: {}'.format(e))

                logging.info('%s updated in PRL (removed from collection %s)',
                             record_identifier, record_metadata[3])
        else:
            logging.info('DRY-RUN: Removed %s', path)
Exemple #17
0
    def read_harvester_settings_file(self) -> Dict[str, Dict[str, str]]:
        """Returns a dictionary representing the harvester settings.

        First, tries reading the settings as if the source file is UTF-8 encoded JSON of the following form (used for testing):

        {
            "harvester_settings_key_1": {
                "repository_name": "repository_name_1",
                "base_url": "http://example.edu/oai2",
                "set_spec": "set_spec_1",
                "split_by_set": False
            },
            ...
        }

        If that fails, tries reading the settings as if the source file is a serialized java.util.Hashtable instance from jOAI (used for production).
        """

        harvester_settings_path = self.get_harvester_settings_path()

        try:
            # See if it's in JSON already.
            with open(harvester_settings_path, 'r') as harvester_settings_file:
                # Make sure we transform the key before storing.
                return {
                    self.get_harvester_settings_key(key): metadata
                    for key, metadata in json.load(
                        harvester_settings_file).items()
                }
        except JSONDecodeError as e:
            # Invalid JSON.
            raise IndexerError(
                'Cannot load scheduled harvests settings: {}'.format(e))
        except FileNotFoundError:
            # BUG FIX (consistency with the sibling implementation): this
            # file won't exist when no harvests have been scheduled, so it's
            # probably fine; return an empty mapping instead of raising.
            logging.debug(
                'Scheduled harvests settings file does not exist: {}'.format(
                    harvester_settings_path))
            return {}
        except UnicodeDecodeError as e:
            # The file is binary, so fall through to the jOAI parsing path.
            logging.debug('Config file is not JSON: {}'.format(e))

            # Open the file in binary mode and try to parse it with javaobj.
            with open(harvester_settings_path,
                      'rb') as harvester_settings_file:
                pobj = JavaObjectUnmarshaller(
                    harvester_settings_file).readObject()

            # Select jOAI ScheduledHarvest objects among the parsed
            # annotations by matching the configured Java class name.
            scheduled_harvest_class = self.config['leveldb'][
                'harvester_settings']['source']['classes']['scheduled_harvest']
            is_scheduled_harvest = lambda h: scheduled_harvest_class in str(h)

            # Re-key each scheduled harvest by its harvest directory and keep
            # only the fields the indexer needs.
            return {
                self.get_harvester_settings_key(pobj_harvest.harvestDir.path):
                {
                    'repository_name': pobj_harvest.repositoryName,
                    'base_url': pobj_harvest.baseURL,
                    'set_spec': pobj_harvest.setSpec,
                    'split_by_set': pobj_harvest.splitBySet
                }
                for pobj_harvest in list(
                    filter(is_scheduled_harvest, pobj.annotations))
            }
        except Exception as e:
            # Something else went wrong.
            raise IndexerError(
                'Cannot load scheduled harvests settings: {}'.format(e))
Exemple #18
0
    def get_key_record_metadata(self, file_path: str):
        """Determines collection and institution metadata from the filepath of the record.

        Returns a 5-tuple containing the following elements:
            - an identifier for the record
            - an identifier for the institution
            - a human-readable string for the institution
            - an identifier for the collection
            - a human-readable string for the collection

        Side effects:
            - updates local LevelDB cache with OAI-PMH repository metadata

        Raises:
            IndexerError: if harvester settings cannot be found or parsed, or
                if the harvest configuration is unsupported.
        """

        # ---------------------------------------- #
        # --- Gather all the data we can find. --- #
        # ---------------------------------------- #

        # Get the record identifier from the filename.
        identifier = urllib.parse.unquote(
            os.path.splitext(os.path.basename(file_path))[0])

        try:
            # The harvester settings will tell us how to get the other metadata.
            harvester_settings_key = None

            # The settings key may derive from either the record's parent or
            # grandparent directory, depending on the harvest layout.
            potential_harvester_settings_keys = map(
                self.get_harvester_settings_key, [
                    os.path.dirname(file_path),
                    os.path.dirname(os.path.dirname(file_path))
                ])
            # Keep track of keys that we tried, but failed.
            tried_keys = []

            for potential_harvester_settings_key in potential_harvester_settings_keys:
                potential_harvester_settings_serialized_encoded = self.harvester_settings.get(
                    potential_harvester_settings_key.encode())

                if potential_harvester_settings_serialized_encoded:
                    # Found it!
                    harvester_settings_key = potential_harvester_settings_key
                    break
                else:
                    tried_keys.append(potential_harvester_settings_key)

            if harvester_settings_key is not None:
                harvester_settings_serialized_encoded = potential_harvester_settings_serialized_encoded
                harvester_settings_serialized = harvester_settings_serialized_encoded.decode(
                )
                harvester_settings = json.loads(harvester_settings_serialized)
            else:
                # This should never happen. Harvester settings should represent all harvested files.
                raise IndexerError(
                    'Cannot find harvester settings in LevelDB for {}'.format(
                        tried_keys))

        except plyvel.Error as e:
            # We can't go on without LevelDB.
            raise IndexerError('Failed to GET on LevelDB: {}'.format(e))
        except AttributeError as e:
            # This should never happen. Harvester settings should represent all harvested files.
            raise IndexerError(
                'Cannot find harvester settings in LevelDB for {}'.format(
                    harvester_settings_key))
        except JSONDecodeError as e:
            # This should never happen.
            raise IndexerError(
                'Harvester settings are not valid JSON: {}'.format(e))

        base_url = harvester_settings['base_url']
        institution_name = harvester_settings['repository_name']
        set_spec = harvester_settings['set_spec']
        split_by_set = harvester_settings['split_by_set']

        # Fetch repository metadata, and write to the in-memory cache if necessary.
        if base_url in self.oai_pmh_cache:
            oai_pmh_metadata = self.oai_pmh_cache[base_url]
        else:
            oai_pmh_metadata = self.get_oai_pmh_metadata(base_url)
            self.oai_pmh_cache[base_url] = oai_pmh_metadata

        # ----------------------------------------- #
        # --- Determine which values to return. --- #
        # ----------------------------------------- #

        # This is the most common case: an institution specifies a specific set for us to harvest.
        individual_set_harvest = set_spec != '' and not split_by_set

        # This is the case when an institution wants us to harvest all sets from their repository.
        full_repository_harvest = set_spec == '' and split_by_set

        # This is the case when an institution wants us to treat their entire repository as a PRL "collection".
        single_collection_repository = set_spec == '' and not split_by_set

        # Set the return values.
        if individual_set_harvest:
            institution_key = os.path.dirname(harvester_settings_key)
            collection_key = set_spec
            collection_name = oai_pmh_metadata['sets'][set_spec]

        elif full_repository_harvest:
            institution_key = harvester_settings_key
            collection_key = os.path.basename(os.path.dirname(file_path))
            # BUG FIX: in a split-by-set harvest, 'set_spec' is '' in this
            # branch, so indexing the sets mapping with it raised a KeyError;
            # the set's identifier here is the per-set directory name
            # (collection_key).
            collection_name = oai_pmh_metadata['sets'][collection_key]

        elif single_collection_repository:
            institution_key = os.path.dirname(harvester_settings_key)
            collection_key = os.path.basename(harvester_settings_key)
            collection_name = oai_pmh_metadata['repository_name']
        else:
            raise IndexerError(
                'Unable to handle harvest configuration: {}'.format(
                    harvester_settings_key))

        return (identifier, institution_key, institution_name, collection_key,
                collection_name)