Python OCLCClassifyCoverageProvider Examples, oclc_classify.OCLCClassifyCoverageProvider Python Examples

Example #1

0

Show file

 def __init__(self, test_session=None):
     """Create the coverage providers and VIAF client this object uses.

     :param test_session: Optional database session. When truthy it is
         stored as ``self._session`` before any collaborator is built,
         which lets tests run without db session overlap.
     """
     if test_session:
         self._session = test_session

     # All three collaborators are built from the same ``self._db`` value.
     db = self._db
     self.coverage = LinkedDataCoverageProvider(db)
     self.oclc_classify = OCLCClassifyCoverageProvider(db)
     self.viaf = VIAFClient(db)

Example #2

0

Show file

 def setup(self):
     """Create the edition, identifier, mock API and provider under test."""
     super(TestOCLCClassifyCoverageProvider, self).setup()

     # _edition() hands back a pair; only its first element is kept.
     edition, _ = self._edition(
         data_source_name=DataSource.GUTENBERG, with_license_pool=True
     )
     self.edition = edition
     self.identifier = edition.primary_identifier

     # The provider under test is handed the mock API explicitly.
     mock_api = MockOCLCClassifyAPI()
     self.api = mock_api
     self.provider = OCLCClassifyCoverageProvider(self._db, api=mock_api)

Example #3

0

Show file

File: test_oclc_classify.py Project: NYPL-Simplified/metadata_wrangler

    def setup(self):
        """Create the edition, identifier, mock API and provider under test."""
        super(TestOCLCClassifyCoverageProvider, self).setup()

        # Only the first element of what _edition() returns is used.
        edition = self._edition(with_license_pool=True)[0]
        self.edition = edition
        self.identifier = edition.primary_identifier

        # The provider under test is handed the mock API explicitly.
        mock_api = MockOCLCClassifyAPI()
        self.api = mock_api
        self.provider = OCLCClassifyCoverageProvider(self._db, api=mock_api)

Example #4

0

Show file

 def __init__(self, test_session=None):
     """Create the coverage providers and VIAF client this object uses.

     :param test_session: Optional database session. When truthy it is
         stored as ``self._session`` before any collaborator is built,
         which lets tests run without db session overlap.
     """
     if test_session:
         self._session = test_session

     # All three collaborators are built from the same ``self._db`` value.
     db = self._db
     self.coverage = LinkedDataCoverageProvider(db)
     self.oclc_classify = OCLCClassifyCoverageProvider(db)
     self.viaf = VIAFClient(db)

Example #5

0

Show file

File: coverage.py Project: rskm1/metadata_wrangler

    def providers(self):
        """Build the CoverageProviders to run over this Collection's catalog.

        Every Identifier in the catalog is run through each provider.
        A failing optional provider has no effect; a failing required
        provider fails the coverage operation as a whole.

        NOTE: The providers created here go against real servers, so
        tests must use a subclass that mocks providers(), such as
        MockIdentifierResolutionCoverageProvider.

        :return: A 2-tuple ``(optional, required)`` of provider lists.
        """
        # Content Cafe and OCLC Classify apply to every book, assuming
        # the identifier is of a type they handle.
        required = [
            ContentCafeCoverageProvider(
                self._db, api=self.content_cafe_api, uploader=self.uploader
            ),
            OCLCClassifyCoverageProvider(self._db),
        ]
        optional = []

        collection = self.collection
        protocol = collection.protocol

        # Books that came from an OPDS import of the open-access content
        # server must also be looked up in that server.
        #
        # TODO: This could stand some generalization. Any OPDS server
        # that also supports the lookup protocol can be used here.
        if protocol == ExternalIntegration.OPDS_IMPORT:
            # The data source is only consulted once the protocol matches.
            data_source = collection.data_source
            if data_source and data_source.name == DataSource.OA_CONTENT_SERVER:
                required.append(LookupClientCoverageProvider(collection))

        # Overdrive books must be looked up through the Overdrive API.
        if protocol == ExternalIntegration.OVERDRIVE:
            overdrive = OverdriveBibliographicCoverageProvider(
                collection, api_class=self.overdrive_api
            )
            required.append(overdrive)

        return optional, required

Example #6

0

Show file

class TestOCLCClassifyCoverageProvider(DatabaseTest):
    """Tests for OCLCClassifyCoverageProvider run against a mock API."""

    def setup(self):
        """Create a Gutenberg edition with a license pool, plus a provider
        that is handed a MockOCLCClassifyAPI."""
        super(TestOCLCClassifyCoverageProvider, self).setup()

        # _edition() hands back a pair; only its first element is kept.
        edition, _ = self._edition(
            data_source_name=DataSource.GUTENBERG, with_license_pool=True
        )
        self.edition = edition
        self.identifier = edition.primary_identifier

        mock_api = MockOCLCClassifyAPI()
        self.api = mock_api
        self.provider = OCLCClassifyCoverageProvider(self._db, api=mock_api)

    def sample_data(self, filename):
        """Fetch `filename` through the module-level sample_data() helper,
        passing 'oclc_classify' as its second argument."""
        return sample_data(filename, 'oclc_classify')

    def test_oclc_safe_title(self):
        """oclc_safe_title() maps each input title to the expected output."""
        plain_title = 'The Curious Incident of the Dog in the Night-Time'
        cases = [
            # None becomes an empty string.
            (None, ''),
            # A title with no special characters comes back unchanged.
            (plain_title, plain_title),
            # Otherwise the special characters are removed.
            ('3 Blind Mice & Other Tales: A Bedtime Reader',
             '3 Blind Mice  Other Tales A Bedtime Reader'),
        ]
        for title, expected in cases:
            eq_(self.provider.oclc_safe_title(title), expected)

    def test_process_item_without_book_information(self):
        """process_item() fails until the edition has a title and an author."""
        def run_provider():
            # Queue a canned response before every attempt.
            canned = self.sample_data('jane_eyre.xml')
            self.api.queue_lookup(canned)
            return self.provider.process_item(self.identifier)

        def assert_missing_information(outcome):
            eq_(True, isinstance(outcome, CoverageFailure))
            eq_(True, outcome.exception.endswith('title and author!'))

        # An edition with no title.
        self.edition.title = None
        assert_missing_information(run_provider())

        # An edition with a title but no author.
        self.edition.title = u"Jane Eyre"
        self._db.delete(self.edition.contributions[0])
        self._db.commit()
        assert_missing_information(run_provider())

        # An edition with both a title and an author.
        author = self._contributor(sort_name="Bronte, Charlotte")[0]
        self.edition.add_contributor(author, Contributor.AUTHOR_ROLE)
        eq_(run_provider(), self.identifier)

    def test_process_item_when_parsing_error_occurs(self):
        """An exception raised while parsing becomes a CoverageFailure."""
        class AlwaysErrorsClassifyProvider(OCLCClassifyCoverageProvider):
            def parse_edition_data(self, *args, **kwargs):
                raise IOError('It broke!')

        failing_provider = AlwaysErrorsClassifyProvider(self._db, api=self.api)
        self.api.queue_lookup(self.sample_data('jane_eyre.xml'))
        failure = failing_provider.process_item(self.identifier)

        # The failure carries the identifier, the message and the source.
        eq_(True, isinstance(failure, CoverageFailure))
        eq_(self.identifier, failure.obj)
        eq_('It broke!', failure.exception)
        eq_(failing_provider.data_source, failure.data_source)

Example #7

0

Show file

File: test_oclc_classify.py Project: NYPL-Simplified/metadata_wrangler

class TestOCLCClassifyCoverageProvider(DatabaseTest):
    """Tests for OCLCClassifyCoverageProvider run against a mock API."""

    def setup(self):
        """Create an edition with a license pool, plus a provider that is
        handed a MockOCLCClassifyAPI."""
        super(TestOCLCClassifyCoverageProvider, self).setup()

        # Only the first element of what _edition() returns is used.
        edition = self._edition(with_license_pool=True)[0]
        self.edition = edition
        self.identifier = edition.primary_identifier

        mock_api = MockOCLCClassifyAPI()
        self.api = mock_api
        self.provider = OCLCClassifyCoverageProvider(self._db, api=mock_api)

    def sample_data(self, filename):
        """Fetch `filename` through the module-level sample_data() helper,
        passing "oclc_classify" as its second argument."""
        return sample_data(filename, "oclc_classify")

    def test_oclc_safe_title(self):
        """oclc_safe_title() maps each input title to the expected output."""
        plain_title = "The Curious Incident of the Dog in the Night-Time"
        cases = [
            # None becomes an empty string.
            (None, ""),
            # A title with no special characters comes back unchanged.
            (plain_title, plain_title),
            # Otherwise the special characters are removed.
            (
                "3 Blind Mice & Other Tales: A Bedtime Reader",
                "3 Blind Mice  Other Tales A Bedtime Reader",
            ),
        ]
        for title, expected in cases:
            eq_(self.provider.oclc_safe_title(title), expected)

    def test_process_item_without_book_information(self):
        """process_item() fails until the edition has a title and an author."""
        def run_provider():
            # Queue a canned response before every attempt.
            canned = self.sample_data("jane_eyre.xml")
            self.api.queue_lookup(canned)
            return self.provider.process_item(self.identifier)

        def assert_missing_information(outcome):
            eq_(True, isinstance(outcome, CoverageFailure))
            eq_(True, outcome.exception.endswith("title and author!"))

        # An edition with no title.
        self.edition.title = None
        assert_missing_information(run_provider())

        # An edition with a title but no author.
        self.edition.title = "Jane Eyre"
        self._db.delete(self.edition.contributions[0])
        self._db.commit()
        assert_missing_information(run_provider())

        # An edition with both a title and an author.
        author = self._contributor(sort_name="Bronte, Charlotte")[0]
        self.edition.add_contributor(author, Contributor.AUTHOR_ROLE)
        eq_(run_provider(), self.identifier)

    def test_process_item_when_parsing_error_occurs(self):
        """An exception raised while parsing becomes a CoverageFailure."""
        class AlwaysErrorsClassifyProvider(OCLCClassifyCoverageProvider):
            def parse_edition_data(self, *args, **kwargs):
                raise IOError("It broke!")

        failing_provider = AlwaysErrorsClassifyProvider(self._db, api=self.api)
        self.api.queue_lookup(self.sample_data("jane_eyre.xml"))
        failure = failing_provider.process_item(self.identifier)

        # The failure carries the identifier, the message and the source.
        eq_(True, isinstance(failure, CoverageFailure))
        eq_(self.identifier, failure.obj)
        eq_("It broke!", failure.exception)
        eq_(failing_provider.output_source, failure.data_source)

Example #8

0

Show file

class RedoOCLCForThreeMScript(Script):
    """Re-run OCLC lookups for ThreeM identifiers that have no contributors.

    The script deletes the existing OCLC Linked Data coverage records for
    those identifiers, runs OCLC Classify for the ones with no ISBN
    equivalent, re-runs Linked Data coverage, and copies any contributors
    found on equivalent OCLC identifiers onto the ThreeM editions.
    """

    def __init__(self, test_session=None):
        """Create the coverage providers and VIAF client the script uses.

        :param test_session: Optional database session. When truthy it is
            stored as ``self._session`` before any collaborator is built.
        """
        # Allows tests to run without db session overlap.
        if test_session:
            self._session = test_session
        self.coverage = LinkedDataCoverageProvider(self._db)
        self.oclc_classify = OCLCClassifyCoverageProvider(self._db)
        self.viaf = VIAFClient(self._db)

    @property
    def input_data_source(self):
        """The DataSource looked up under DataSource.OCLC_LINKED_DATA.

        Every access performs a new DataSource.lookup() call.
        """
        return DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)

    def do_run(self):
        """Re-runs OCLC Linked Data coverage provider to get viafs. Fetches
        author information and recalculates presentation."""
        identifiers = self.fetch_authorless_threem_identifiers()
        self.delete_coverage_records(identifiers)
        self.ensure_isbn_identifier(identifiers)
        for identifier in identifiers:
            self.coverage.ensure_coverage(identifier)
            self.merge_contributors(identifier)
            # Recalculate everything so the contributors can be seen.
            for contributor in identifier.primary_edition.contributors:
                self.viaf.process_contributor(contributor)
            identifier.primary_edition.calculate_presentation()
            if identifier.licensed_through:
                identifier.licensed_through.calculate_work()

    def fetch_authorless_threem_identifiers(self):
        """Returns a list of ThreeM identifiers that don't have contributors"""
        qu = self._db.query(Identifier).join(Identifier.primarily_identifies)
        # The outer join leaves Contribution.id NULL for editions that
        # have no contributions; is_(None) renders that IS NULL test.
        qu = qu.outerjoin(
            Edition.contributions).filter(Contribution.id.is_(None))
        qu = qu.filter(Identifier.type == Identifier.THREEM_ID)
        return qu.all()

    def delete_coverage_records(self, identifiers):
        """Deletes existing OCLC Linked Data coverage records to re-run and
        capture author data.

        The deletions happen inside a nested transaction that is committed
        when the loop finishes and rolled back if anything raises.

        :param identifiers: Iterable of identifiers whose
            ``coverage_records`` are examined.
        """
        with self._db.begin_nested():
            # Look the data source up once instead of once per record;
            # the property runs DataSource.lookup() on every access.
            data_source = self.input_data_source
            for identifier in identifiers:
                for coverage_record in identifier.coverage_records:
                    if coverage_record.data_source == data_source:
                        self._db.delete(coverage_record)

    def ensure_isbn_identifier(self, identifiers):
        """Runs OCLCClassify to get ISBN numbers if they're not available.

        :param identifiers: Iterable of identifiers. Those with no
            equivalency whose output is of type Identifier.ISBN are passed
            to the OCLC Classify provider's ensure_coverage().
        """
        # Select every candidate first, then run coverage on them.
        identifiers_without_isbn = [
            identifier for identifier in identifiers
            if Identifier.ISBN not in [
                equivalency.output.type
                for equivalency in identifier.equivalencies
            ]
        ]

        for identifier in identifiers_without_isbn:
            self.oclc_classify.ensure_coverage(identifier)

    def merge_contributors(self, identifier):
        """Gives a ThreeM primary edition any contributors found via OCLC-LD

        :param identifier: The identifier whose editions receive the
            contributors. Contributions are gathered from editions of
            OCLC Work / OCLC Number identifiers whose inbound equivalency
            has this identifier as its input.
        """
        qu = self._db.query(Identifier).join(Identifier.inbound_equivalencies)
        qu = qu.filter(
            or_(Identifier.type == Identifier.OCLC_WORK,
                Identifier.type == Identifier.OCLC_NUMBER)).filter(
                    Equivalency.input_id == identifier.id)

        # Gather every contribution first, then apply them.
        oclc_contributions = []
        for oclc_identifier in qu.all():
            for edition in oclc_identifier.primarily_identifies:
                oclc_contributions.extend(edition.contributions)

        for contribution in oclc_contributions:
            for edition in identifier.primarily_identifies:
                edition.add_contributor(contribution.contributor,
                                        contribution.role)

Example #9

0

Show file

class RedoOCLCForThreeMScript(Script):
    """Re-run OCLC lookups for ThreeM identifiers that have no contributors.

    The script deletes the existing OCLC Linked Data coverage records for
    those identifiers, runs OCLC Classify for the ones with no ISBN
    equivalent, re-runs Linked Data coverage, and copies any contributors
    found on equivalent OCLC identifiers onto the ThreeM editions.
    """

    def __init__(self, test_session=None):
        """Create the coverage providers and VIAF client the script uses.

        :param test_session: Optional database session. When truthy it is
            stored as ``self._session`` before any collaborator is built.
        """
        # Allows tests to run without db session overlap.
        if test_session:
            self._session = test_session
        self.coverage = LinkedDataCoverageProvider(self._db)
        self.oclc_classify = OCLCClassifyCoverageProvider(self._db)
        self.viaf = VIAFClient(self._db)

    @property
    def input_data_source(self):
        """The DataSource looked up under DataSource.OCLC_LINKED_DATA.

        Every access performs a new DataSource.lookup() call.
        """
        return DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)

    def do_run(self):
        """Re-runs OCLC Linked Data coverage provider to get viafs. Fetches
        author information and recalculates presentation."""
        identifiers = self.fetch_authorless_threem_identifiers()
        self.delete_coverage_records(identifiers)
        self.ensure_isbn_identifier(identifiers)
        for identifier in identifiers:
            self.coverage.ensure_coverage(identifier)
            self.merge_contributors(identifier)
            # Recalculate everything so the contributors can be seen.
            for contributor in identifier.primary_edition.contributors:
                self.viaf.process_contributor(contributor)
            identifier.primary_edition.calculate_presentation()
            if identifier.licensed_through:
                identifier.licensed_through.calculate_work()

    def fetch_authorless_threem_identifiers(self):
        """Returns a list of ThreeM identifiers that don't have contributors"""
        qu = self._db.query(Identifier).join(Identifier.primarily_identifies)
        # The outer join leaves Contribution.id NULL for editions that
        # have no contributions; is_(None) renders that IS NULL test.
        qu = qu.outerjoin(Edition.contributions).filter(
            Contribution.id.is_(None)
        )
        qu = qu.filter(Identifier.type == Identifier.THREEM_ID)
        return qu.all()

    def delete_coverage_records(self, identifiers):
        """Deletes existing OCLC Linked Data coverage records to re-run and
        capture author data.

        The deletions happen inside a nested transaction that is committed
        when the loop finishes and rolled back if anything raises.

        :param identifiers: Iterable of identifiers whose
            ``coverage_records`` are examined.
        """
        with self._db.begin_nested():
            # Look the data source up once instead of once per record;
            # the property runs DataSource.lookup() on every access.
            data_source = self.input_data_source
            for identifier in identifiers:
                for coverage_record in identifier.coverage_records:
                    if coverage_record.data_source == data_source:
                        self._db.delete(coverage_record)

    def ensure_isbn_identifier(self, identifiers):
        """Runs OCLCClassify to get ISBN numbers if they're not available.

        :param identifiers: Iterable of identifiers. Those with no
            equivalency whose output is of type Identifier.ISBN are passed
            to the OCLC Classify provider's ensure_coverage().
        """
        # Select every candidate first, then run coverage on them.
        identifiers_without_isbn = [
            identifier for identifier in identifiers
            if Identifier.ISBN not in [
                equivalency.output.type
                for equivalency in identifier.equivalencies
            ]
        ]

        for identifier in identifiers_without_isbn:
            self.oclc_classify.ensure_coverage(identifier)

    def merge_contributors(self, identifier):
        """Gives a ThreeM primary edition any contributors found via OCLC-LD

        :param identifier: The identifier whose editions receive the
            contributors. Contributions are gathered from editions of
            OCLC Work / OCLC Number identifiers whose inbound equivalency
            has this identifier as its input.
        """
        qu = self._db.query(Identifier).join(Identifier.inbound_equivalencies)
        qu = qu.filter(or_(
            Identifier.type == Identifier.OCLC_WORK,
            Identifier.type == Identifier.OCLC_NUMBER
        )).filter(Equivalency.input_id == identifier.id)

        # Gather every contribution first, then apply them.
        oclc_contributions = []
        for oclc_identifier in qu.all():
            for edition in oclc_identifier.primarily_identifies:
                oclc_contributions.extend(edition.contributions)

        for contribution in oclc_contributions:
            for edition in identifier.primarily_identifies:
                edition.add_contributor(contribution.contributor, contribution.role)