Ejemplo n.º 1
0
 def __init__(self, test_session=None):
     """Set up the coverage providers and VIAF client this script uses.

     :param test_session: Optional database session, supplied so tests
         can avoid overlapping with the production session.
     """
     # Allows tests to run without db session overlap.
     if test_session:
         self._session = test_session
     # All helpers share this object's database session; self._db is
     # presumably provided by the base class -- TODO confirm.
     self.coverage = LinkedDataCoverageProvider(self._db)
     self.oclc_classify = OCLCClassifyCoverageProvider(self._db)
     self.viaf = VIAFClient(self._db)
Ejemplo n.º 2
0
    def resolve_viaf(self, work):
        """Look up VIAF data for every contributor attached to *work*.

        Walks each license pool's presentation edition and runs its
        contributors through a VIAFClient; any contributor still lacking
        a display name afterwards falls back to its default names.
        """
        client = VIAFClient(self._db)
        for license_pool in work.license_pools:
            presentation = license_pool.presentation_edition
            for person in presentation.contributors:
                client.process_contributor(person)
                if person.display_name:
                    continue
                # VIAF gave us nothing usable; derive names locally.
                person.family_name, person.display_name = (
                    person.default_names())
Ejemplo n.º 3
0
 def __init__(self, _db, *args, **kwargs):
     """Constructor.

     Optional 'api' and 'viaf_api' keyword arguments inject pre-built
     (possibly mock) clients; otherwise real ones are created against
     *_db*. Either way the keyword is removed before it can reach the
     superclass constructor.
     """
     # pop() both reads and removes the keyword in one step.
     if 'api' in kwargs:
         self.api = kwargs.pop('api')
     else:
         self.api = OCLCLinkedData(_db)
     if 'viaf_api' in kwargs:
         self.viaf = kwargs.pop('viaf_api')
     else:
         self.viaf = VIAFClient(_db)
     super(LinkedDataCoverageProvider, self).__init__(_db, *args, **kwargs)
Ejemplo n.º 4
0
 def __init__(self, test_session=None):
     # Allows tests to run without db session overlap.
     if test_session:
         self._session = test_session
     # The coverage providers and the VIAF client all share this
     # script's database session (self._db -- presumably supplied by
     # the Script base class; TODO confirm).
     self.coverage = LinkedDataCoverageProvider(self._db)
     self.oclc_classify = OCLCClassifyCoverageProvider(self._db)
     self.viaf = VIAFClient(self._db)
Ejemplo n.º 5
0
    def __init__(self, collection, *args, **kwargs):
        """Create the provider for *collection*.

        Optional 'api' and 'viaf' keyword arguments replace the default
        OCLCLinkedData and VIAFClient instances (useful for tests).
        """
        _db = Session.object_session(collection)

        # A falsy or absent keyword means "build the real client".
        self.api = kwargs.pop('api', None) or OCLCLinkedData(_db)
        self.viaf = kwargs.pop('viaf', None) or VIAFClient(_db)

        # presumably restricts coverage to identifiers already
        # registered with this provider -- TODO confirm.
        kwargs['registered_only'] = True
        super(LinkedDataCoverageProvider,
              self).__init__(collection, *args, **kwargs)
Ejemplo n.º 6
0
    def __init__(self, collection, viaf=None, **kwargs):
        """Constructor.

        :param collection: The Collection being covered; its database
            session is used for all client construction.
        :param viaf: Optional VIAFClient (e.g. a mock for tests).
        :param api_class: (keyword) Either an OverdriveAPI class to
            instantiate via generic_overdrive_api, or an already-built
            object such as a mock.
        :raises CannotLoadConfiguration: If no usable Overdrive API
            object could be obtained.
        """
        _db = Session.object_session(collection)
        api_class = kwargs.pop('api_class', OverdriveAPI)
        if callable(api_class):
            api = self.generic_overdrive_api(_db, api_class)
        else:
            # The API 'class' is actually an object, probably a mock.
            api = api_class
        if not api:
            raise CannotLoadConfiguration(
                """OverdriveBibliographicCoverageProvider requires at least one fully configured Overdrive collection."""
            )

        self.viaf = viaf or VIAFClient(_db)

        # presumably restricts coverage to identifiers already
        # registered with this provider -- TODO confirm.
        kwargs['registered_only'] = True
        super(OverdriveBibliographicCoverageProvider,
              self).__init__(collection, api_class=api, **kwargs)
Ejemplo n.º 7
0
    def __init__(self,
                 collection,
                 uploader=None,
                 viaf_client=None,
                 linked_data_coverage_provider=None,
                 content_cafe_api=None,
                 overdrive_api_class=OverdriveAPI,
                 **kwargs):
        """Constructor.

        :param collection: The Collection whose catalog this provider
            resolves.
        :param uploader: Optional S3Uploader; a real one is built from
            configuration when omitted.
        :param viaf_client: Optional VIAFClient used for author lookups.
        :param linked_data_coverage_provider: Optional
            LinkedDataCoverageProvider; when omitted a real one is
            built, sharing this provider's VIAF client.
        :param content_cafe_api: Optional Content Cafe API client
            (may be None).
        :param overdrive_api_class: Class (or mock object) passed to
            create_overdrive_api to build the Overdrive API.
        """

        super(IdentifierResolutionCoverageProvider,
              self).__init__(collection, **kwargs)

        # Since we are the metadata wrangler, any resources we find,
        # we mirror to S3.
        if not uploader:
            uploader = S3Uploader.from_config(self._db)
        self.uploader = uploader

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        self.policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True)

        self.overdrive_api = self.create_overdrive_api(overdrive_api_class)

        self.content_cafe_api = content_cafe_api

        # Determine the optional and required coverage providers.
        # Each Identifier in this Collection's catalog will be run
        # through all relevant providers.
        self.required_coverage_providers, self.optional_coverage_providers = self.providers(
        )

        # When we need to look up a contributor via VIAF we will use this
        # client.
        self.viaf_client = viaf_client or VIAFClient(self._db)

        # Books are not looked up in OCLC Linked Data directly, since
        # there is no Collection that identifies a book by its OCLC Number.
        # However, when a book is looked up through OCLC Classify, some
        # OCLC Numbers may be associated with it, and _those_ numbers
        # can be run through OCLC Linked Data.
        #
        # TODO: We get many books identified by ISBN, and those books
        # _could_ be run through a LinkedDataCoverageProvider if it
        # worked a little differently. However, I don't think this
        # would be very useful, since those books will get looked up
        # through OCLC Classify, which will probably result in us
        # finding that same ISBN via OCLC Number.
        self.oclc_linked_data = (linked_data_coverage_provider
                                 or LinkedDataCoverageProvider(
                                     self._db, viaf_api=self.viaf_client))

        # The ordinary OverdriveBibliographicCoverageProvider
        # doesn't upload images, so we need to create our own
        # mirror and scaler.
        #
        # TODO: This class would be neater if we were to subclass
        # OverdriveBibliographicCoverageProvider to do the scaling and
        # uploading.
        self.image_mirrors = {
            DataSource.OVERDRIVE:
            OverdriveCoverImageMirror(self._db, uploader=uploader)
        }
        self.image_scaler = ImageScaler(self._db,
                                        self.image_mirrors.values(),
                                        uploader=uploader)
Ejemplo n.º 8
0
 def __init__(self, viaf=None):
     """Store the given VIAF client, or construct one from self._db.

     Note that a falsy *viaf* triggers construction of a real client,
     so tests should inject a truthy mock.
     """
     if viaf:
         self.viaf = viaf
     else:
         self.viaf = VIAFClient(self._db)
Ejemplo n.º 9
0
 def run(self):
     """Fill in all author names with information from VIAF."""
     client = VIAFClient(self._db)
     client.run(self.force)
Ejemplo n.º 10
0
 def setup(self):
     """Prepare the VIAF client and logger each test relies on."""
     super(TestVIAFClient, self).setup()
     self.log = logging.getLogger("VIAF Client Test")
     self.client = VIAFClient(self._db)
Ejemplo n.º 11
0
class TestVIAFClient(DatabaseTest):
    """Tests for VIAFClient lookups and contributor processing."""

    def setup(self):
        super(TestVIAFClient, self).setup()
        self.client = VIAFClient(self._db)
        self.log = logging.getLogger("VIAF Client Test")

    def sample_data(self, filename):
        """Load a sample file from the 'viaf' test-data directory."""
        return sample_data(filename, "viaf")

    def queue_file_in_mock_http(self, filename):
        """Return a DummyHTTPClient primed to serve *filename* as a
        200 text/xml response."""
        h = DummyHTTPClient()
        xml = self.sample_data(filename)
        h.queue_response(200, media_type='text/xml', content=xml)
        return h

    def test_process_contributor(self):
        client = MockVIAFClientLookup(self._db, self.log)
        contributor = self._contributor()[0]

        # If lookup returns an empty array (as in the case of
        # VIAFParser#parse_multiple), the contributor is not updated.
        client.queue_lookup([])
        client.process_contributor(contributor)
        eq_(contributor.sort_name, '2001')
        eq_(contributor.display_name, None)

        def queue_lookup_result():
            # Run a real lookup against canned XML, then queue the
            # result on the mock client.
            http = self.queue_file_in_mock_http("mindy_kaling.xml")
            lookup = self.client.lookup_by_viaf(viaf="9581122",
                                                do_get=http.do_get)
            client.results = [lookup]

        # When lookup is successful, the contributor is updated.
        queue_lookup_result()
        client.process_contributor(contributor)
        eq_(contributor.sort_name, "Kaling, Mindy")
        eq_(contributor.display_name, "Mindy Kaling")

        # If a contributor with the same VIAF number already exists,
        # the original contributor will be updated with VIAF data
        # and the processed contributor will be merged into the original.
        earliest_contributor = contributor
        # Reset the contributors sort name to confirm the data update.
        earliest_contributor.sort_name = None

        # Create a new contributor and contribution to confirm the merge.
        contributor = self._contributor()[0]
        edition = self._edition(authors=contributor.sort_name)
        eq_(edition.contributors, set([contributor]))

        queue_lookup_result()
        client.process_contributor(contributor)
        eq_(earliest_contributor.sort_name, "Kaling, Mindy")
        eq_(edition.contributors, set([earliest_contributor]))
        # The new contributor has been deleted.
        assert contributor not in self._db

        # If the display name of the original contributor is suspiciously
        # different from the VIAF display name, the new contributor will be
        # updated without being merged.
        earliest_contributor.display_name = "Mindy L. Kaling"
        earliest_contributor.sort_name = None
        contributor = self._contributor()[0]
        edition = self._edition(authors=contributor.sort_name)

        queue_lookup_result()
        client.process_contributor(contributor)
        eq_(contributor.viaf, "9581122")
        eq_(contributor.sort_name, "Kaling, Mindy")
        # Earlier contributor has not been updated or merged.
        eq_(earliest_contributor.sort_name, None)
        assert earliest_contributor not in edition.contributors

    def test_lookup_by_viaf(self):
        # there can be one and only one Mindy
        h = self.queue_file_in_mock_http("mindy_kaling.xml")

        contributor_candidate = self.client.lookup_by_viaf(viaf="9581122",
                                                           do_get=h.do_get)
        (selected_candidate, match_confidences,
         contributor_titles) = contributor_candidate
        eq_(selected_candidate.viaf, "9581122")
        eq_(selected_candidate.sort_name, "Kaling, Mindy")

    def test_lookup_by_name(self):
        # there can be one and only one Mindy
        h = self.queue_file_in_mock_http("mindy_kaling.xml")

        (selected_candidate, match_confidences,
         contributor_titles) = self.client.lookup_by_name(
             sort_name="Mindy Kaling", do_get=h.do_get)
        eq_(selected_candidate.viaf, "9581122")
        eq_(selected_candidate.sort_name, "Kaling, Mindy")
Ejemplo n.º 12
0
class LinkedDataCoverageProvider(IdentifierCoverageProvider):
    """Runs Editions obtained from OCLC Lookup through OCLC Linked Data.

    This (maybe) associates a edition with a (potentially) large
    number of ISBNs, which can be used as input into other services.
    """

    SERVICE_NAME = "OCLC Linked Data Coverage Provider"
    DEFAULT_BATCH_SIZE = 10

    DATA_SOURCE_NAME = DataSource.OCLC_LINKED_DATA
    INPUT_IDENTIFIER_TYPES = [
        Identifier.OCLC_WORK, Identifier.OCLC_NUMBER, Identifier.ISBN,
        Identifier.OVERDRIVE_ID
    ]

    def __init__(self, _db, *args, **kwargs):
        """Constructor.

        Optional 'api' and 'viaf_api' keyword arguments inject
        pre-built (possibly mock) clients; otherwise real
        OCLCLinkedData and VIAFClient objects are created against
        *_db*. The keyword is removed before the superclass
        constructor is called.
        """
        if 'api' in kwargs:
            self.api = kwargs['api']
            del kwargs['api']
        else:
            self.api = OCLCLinkedData(_db)
        if 'viaf_api' in kwargs:
            self.viaf = kwargs['viaf_api']
            del kwargs['viaf_api']
        else:
            self.viaf = VIAFClient(_db)
        super(LinkedDataCoverageProvider, self).__init__(_db, *args, **kwargs)

    def process_item(self, identifier):
        """Gather OCLC Linked Data metadata for *identifier* and apply
        it to the associated OCLC editions.

        :return: *identifier* on success, or a failure record (via
            self.failure) when the OCLC API raises an IOError.
        """
        try:
            new_info_counter = Counter()
            self.log.info("Processing identifier %r", identifier)
            metadatas = [m for m in self.api.info_for(identifier)]

            if identifier.type == Identifier.ISBN:
                # Currently info_for seeks the results of OCLC Work IDs only
                # This segment will get the metadata of any equivalent OCLC Numbers
                # as well.
                equivalents = Identifier.recursively_equivalent_identifier_ids(
                    self._db, [identifier.id])
                oclc_numbers = self._db.query(Identifier).\
                    filter(Identifier.id.in_(equivalents)).\
                    filter(Identifier.type==Identifier.OCLC_NUMBER).all()
                for oclc_number in oclc_numbers:
                    more_metadata = [m for m in self.api.info_for(oclc_number)]
                    metadatas += more_metadata
                    metadatas = [m for m in metadatas if m]

            for metadata in metadatas:
                other_identifier, ignore = metadata.primary_identifier.load(
                    self._db)
                oclc_editions = other_identifier.primarily_identifies

                # Keep track of the number of editions OCLC associates
                # with this identifier.
                other_identifier.add_measurement(
                    self.data_source, Measurement.PUBLISHED_EDITIONS,
                    len(oclc_editions))

                # Clean up contributor information.
                self.apply_viaf_to_contributor_data(metadata)
                # Remove any empty ContributorData objects that may have
                # been created.
                # (Python 2: filter() returns a list here, so
                # metadata.contributors remains a list.)
                metadata.contributors = filter(
                    lambda c: c.sort_name or c.display_name,
                    metadata.contributors)

                # When metadata is applied, it must be given a client that can
                # response to 'canonicalize_author_name'. Usually this is an
                # OPDSImporter that reaches out to the Metadata Wrangler, but
                # in the case of being _on_ the Metadata Wrangler...:
                from canonicalize import AuthorNameCanonicalizer
                metadata_client = AuthorNameCanonicalizer(self._db,
                                                          oclcld=self.api,
                                                          viaf=self.viaf)

                num_new_isbns = self.new_isbns(metadata)
                new_info_counter['isbns'] += num_new_isbns
                if oclc_editions:
                    # There are existing OCLC editions. Apply any new information to them.
                    for edition in oclc_editions:
                        metadata, new_info_counter = self.apply_metadata_to_edition(
                            edition, metadata, metadata_client,
                            new_info_counter)
                else:
                    # Create a new OCLC edition to hold the information.
                    edition, ignore = get_one_or_create(
                        self._db,
                        Edition,
                        data_source=self.data_source,
                        primary_identifier=other_identifier)
                    metadata, new_info_counter = self.apply_metadata_to_edition(
                        edition, metadata, metadata_client, new_info_counter)
                    # Set the new OCLC edition's identifier equivalent to this
                    # identifier so we know they're related.
                    self.set_equivalence(identifier, metadata)

                self.log.info(
                    "Total: %(editions)d editions, %(isbns)d ISBNs, "\
                    "%(descriptions)d descriptions, %(subjects)d classifications.",
                    new_info_counter
                )
        except IOError as e:
            # NOTE: e.message is a Python 2-only attribute.
            if ", but couldn't find location" in e.message:
                exception = "OCLC doesn't know about this ISBN: %r" % e
                transient = False
            else:
                exception = "OCLC raised an error: %r" % e
                transient = True
            return self.failure(identifier, exception, transient=transient)
        return identifier

    def apply_viaf_to_contributor_data(self, metadata):
        """Looks up VIAF information for contributors identified by OCLC

        This is particularly crucial for contributors identified solely
        by VIAF IDs (and no sort_name), as it raises errors later in the
        process.
        """
        for contributor_data in metadata.contributors:
            if contributor_data.viaf:
                viaf_contributor_data = self.viaf.lookup_by_viaf(
                    contributor_data.viaf,
                    working_sort_name=contributor_data.sort_name,
                    working_display_name=contributor_data.display_name)[0]
                if viaf_contributor_data:
                    viaf_contributor_data.apply(contributor_data)

    def apply_metadata_to_edition(self, edition, metadata, metadata_client,
                                  counter):
        """Applies metadata and increments counters"""
        metadata.apply(edition,
                       collection=None,
                       metadata_client=metadata_client)
        counter['editions'] += 1
        counter['descriptions'] += len(metadata.links)
        counter['subjects'] += len(metadata.subjects)

        return metadata, counter

    def new_isbns(self, metadata):
        """Returns the number of new isbns on a metadata object"""

        new_isbns = 0
        for identifier_data in metadata.identifiers:
            identifier, new = identifier_data.load(self._db)
            if new:
                new_isbns += 1
        return new_isbns

    def set_equivalence(self, identifier, metadata):
        """Identify the OCLC Number with the OCLC Work"""

        primary_editions = identifier.primarily_identifies
        if primary_editions:
            strength = 0
            # Keep the strongest title/author similarity found among
            # the primary editions.
            for primary_edition in primary_editions:
                if metadata.title:
                    title_strength = MetadataSimilarity.title_similarity(
                        metadata.title, primary_edition.title)
                else:
                    title_strength = 0
                edition_viafs = set(
                    [c.viaf for c in primary_edition.contributors if c.viaf])
                metadata_viafs = set(
                    [c.viaf for c in metadata.contributors if c.viaf])
                author_strength = MetadataSimilarity._proportion(
                    edition_viafs, metadata_viafs)
                edition_strength = (title_strength * 0.8) + (author_strength *
                                                             0.2)
                if edition_strength > strength:
                    strength = edition_strength
        else:
            strength = 1

        if strength > 0:
            primary_identifier, ignore = metadata.primary_identifier.load(
                self._db)
            identifier.equivalent_to(self.data_source, primary_identifier,
                                     strength)
# One-off maintenance script (Python 2): list contributors whose
# display_name looks like a Wikidata identifier ("Q" followed by a
# digit) and print the names VIAF has for them.
from pdb import set_trace
import os
import sys
# Make the package importable when run from its bin/ directory.
bin_dir = os.path.split(__file__)[0]
package_dir = os.path.join(bin_dir, "..")
sys.path.append(os.path.abspath(package_dir))

import re
from core.model import (
    production_session, 
    Contributor, 
)
from viaf import VIAFClient

_db = production_session()
viaf_client = VIAFClient(_db)
from sqlalchemy.sql import text
# Postgres regex match: display names starting with 'Q' plus a digit.
contributors = _db.query(Contributor).filter(
    text("contributors.display_name ~ '^Q[0-9]'")
).order_by(Contributor.id)
print contributors.count()
for contributor in contributors:
    if contributor.viaf:
        viaf, display_name, family_name, sort_name, wikipedia_name = viaf_client.lookup_by_viaf(contributor.viaf)
    else:
        viaf, display_name, family_name, sort_name, wikipedia_name = viaf_client.lookup_by_name(contributor.name)
    # Report only; nothing is written back to the database here.
    print "%s: %s => %s, %s => %s" % (
        contributor.id, 
        contributor.display_name, display_name,
        contributor.wikipedia_name, wikipedia_name
    )
Ejemplo n.º 14
0
class RedoOCLCForThreeMScript(Script):
    """Redo OCLC Linked Data coverage for ThreeM (3M) identifiers whose
    editions have no contributors, then pull contributor and VIAF data
    in and recalculate presentation.
    """

    def __init__(self, test_session=None):
        # Allows tests to run without db session overlap.
        if test_session:
            self._session = test_session
        self.coverage = LinkedDataCoverageProvider(self._db)
        self.oclc_classify = OCLCClassifyCoverageProvider(self._db)
        self.viaf = VIAFClient(self._db)

    @property
    def input_data_source(self):
        # The DataSource whose coverage records delete_coverage_records
        # removes so they can be re-run.
        return DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)

    def do_run(self):
        """Re-runs OCLC Linked Data coverage provider to get viafs. Fetches
        author information and recalculates presentation."""
        identifiers = self.fetch_authorless_threem_identifiers()
        self.delete_coverage_records(identifiers)
        self.ensure_isbn_identifier(identifiers)
        for identifier in identifiers:
            self.coverage.ensure_coverage(identifier)
            self.merge_contributors(identifier)
            # Recalculate everything so the contributors can be seen.
            # NOTE(review): relies on identifier.primary_edition and
            # identifier.licensed_through -- confirm these attributes
            # exist on Identifier in this version of the model.
            for contributor in identifier.primary_edition.contributors:
                self.viaf.process_contributor(contributor)
            identifier.primary_edition.calculate_presentation()
            if identifier.licensed_through:
                identifier.licensed_through.calculate_work()

    def fetch_authorless_threem_identifiers(self):
        """Returns a list of ThreeM identifiers that don't have contributors"""
        qu = self._db.query(Identifier).join(Identifier.primarily_identifies)
        qu = qu.outerjoin(
            Edition.contributions).filter(Contribution.id == None)
        qu = qu.filter(Identifier.type == Identifier.THREEM_ID)
        return qu.all()

    def delete_coverage_records(self, identifiers):
        """Deletes existing OCLC Linked Data coverage records to re-run and
        capture author data"""
        # Wrap the deletes in a nested transaction (savepoint).
        t1 = self._db.begin_nested()

        for identifier in identifiers:
            for coverage_record in identifier.coverage_records:
                if coverage_record.data_source == self.input_data_source:
                    self._db.delete(coverage_record)

        t1.commit()

    def ensure_isbn_identifier(self, identifiers):
        """Runs OCLCClassify to get ISBN numbers if they're not available."""
        identifiers_without_isbn = []
        for identifier in identifiers:
            equivalencies = identifier.equivalencies
            equivalent_types = [eq.output.type for eq in equivalencies]
            if Identifier.ISBN not in equivalent_types:
                identifiers_without_isbn.append(identifier)

        for identifier in identifiers_without_isbn:
            self.oclc_classify.ensure_coverage(identifier)

    def merge_contributors(self, identifier):
        """Gives a ThreeM primary edition any contributors found via OCLC-LD"""
        # Find OCLC Work/Number identifiers equivalent to this one.
        qu = self._db.query(Identifier).join(Identifier.inbound_equivalencies)
        qu = qu.filter(
            or_(Identifier.type == Identifier.OCLC_WORK,
                Identifier.type == Identifier.OCLC_NUMBER)).filter(
                    Equivalency.input_id == identifier.id)

        oclc_contributions = []
        for oclc_identifier in qu.all():
            editions = oclc_identifier.primarily_identifies
            for edition in editions:
                oclc_contributions += edition.contributions

        for contribution in oclc_contributions:
            for edition in identifier.primarily_identifies:
                edition.add_contributor(contribution.contributor,
                                        contribution.role)
Ejemplo n.º 15
0
class IdentifierResolutionCoverageProvider(CatalogCoverageProvider):
    """Make sure all Identifiers associated with some Collection become
    Works.

    Coverage happens by running the Identifier through _other_
    CoverageProviders, which fill in the blanks with data from
    third-party entities.

    This CoverageProvider may force those other CoverageProviders to
    do their work for each Identifier immediately, or it may simply
    register its Identifiers with those CoverageProviders and allow
    them to complete the work at their own pace.

    Unlike most CoverageProviders, which are invoked from a script,
    this CoverageProvider is invoked from
    URNLookupController.process_urns, and only when a client expresses
    a desire that we look into a specific identifier.
    """

    SERVICE_NAME = "Identifier Resolution Coverage Provider"
    DATA_SOURCE_NAME = DataSource.INTERNAL_PROCESSING

    # These are the only identifier types we have any hope of providing
    # insight into.
    INPUT_IDENTIFIER_TYPES = [
        Identifier.OVERDRIVE_ID,
        Identifier.ISBN,
        Identifier.URI,
    ]
    OPERATION = CoverageRecord.RESOLVE_IDENTIFIER_OPERATION

    # We cover all Collections, regardless of their protocol.
    PROTOCOL = None

    def __init__(self,
                 collection,
                 mirror=None,
                 http_get=None,
                 viaf=None,
                 provide_coverage_immediately=False,
                 force=False,
                 provider_kwargs=None,
                 **kwargs):
        """Constructor.

        :param collection: Handle all Identifiers from this Collection
        that were previously registered with this CoverageProvider.

        :param mirror: A MirrorUploader to use if coverage requires
        uploading any cover images to external storage.

        :param http_get: A drop-in replacement for
        Representation.simple_http_get, to be used if any information
        (such as a book cover) needs to be obtained from the public
        Internet.

        :param viaf: A VIAFClient to use if coverage requires
        gathering information about authors from VIAF.

        :param force: Force CoverageProviders to cover identifiers
        even if they believe they have already done the work.

        :param provide_coverage_immediately: If this is True, then
        resolving an identifier means registering it with all of its
        other CoverageProviders *and then attempting to provide
        coverage*.  Registration is considered a success even if the
        other CoverageProviders fail, but the attempt must be made
        immediately.

        If this is False (the default), then resolving an identifier
        just means registering it with all other relevant
        CoverageProviders.

        :param provider_kwargs: Pass this object in as provider_kwargs
        when calling gather_providers at the end of the
        constructor. Used only in testing.

        """
        _db = Session.object_session(collection)

        # Since we are the metadata wrangler, any resources we find,
        # we mirror using the sitewide MirrorUploader.
        if not mirror:
            try:
                mirror = MirrorUploader.sitewide(_db)
            # (Python 2 'except X, e' syntax.) Best-effort: a missing
            # storage integration leaves mirror as None rather than
            # aborting construction.
            except CannotLoadConfiguration, e:
                logging.error(
                    "No storage integration is configured. Cover images will not be stored anywhere.",
                    exc_info=e)
        self.mirror = mirror

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        presentation = PresentationCalculationPolicy(
            regenerate_opds_entries=True)
        replacement_policy = ReplacementPolicy.from_metadata_source(
            presentation_calculation_policy=presentation,
            mirror=self.mirror,
            http_get=http_get,
        )
        super(IdentifierResolutionCoverageProvider,
              self).__init__(collection,
                             replacement_policy=replacement_policy,
                             **kwargs)

        self.provide_coverage_immediately = provide_coverage_immediately
        self.force = force or provide_coverage_immediately

        self.viaf = viaf or VIAFClient(self._db)

        # Instantiate the coverage providers that may be needed to
        # relevant to any given Identifier.
        #
        # Each Identifier in this Collection's catalog will be registered
        # with all relevant providers (if provide_coverage_immediately
        # is False) or immediately covered by all relevant providers
        # (if provide_coverage_immediately is True).
        self.providers = self.gather_providers(provider_kwargs)
Ejemplo n.º 16
0
class TestVIAFClient(DatabaseTest):
    """Exercise VIAFClient against canned VIAF XML responses."""

    def setup(self):
        super(TestVIAFClient, self).setup()
        self.client = VIAFClient(self._db)
        self.log = logging.getLogger("VIAF Client Test")

    def sample_data(self, filename):
        """Read *filename* from the 'viaf' sample-data directory."""
        return sample_data(filename, "viaf")

    def queue_file_in_mock_http(self, filename):
        """Build a DummyHTTPClient that will serve *filename* as a
        200 text/xml response."""
        h = DummyHTTPClient()
        xml = self.sample_data(filename)
        h.queue_response(200, media_type='text/xml', content=xml)
        return h

    def test_process_contributor(self):
        client = MockVIAFClientLookup(self._db, self.log)
        contributor = self._contributor()[0]

        # If lookup returns an empty array (as in the case of
        # VIAFParser#parse_multiple), the contributor is not updated.
        client.queue_lookup([])
        client.process_contributor(contributor)
        eq_(contributor.sort_name, '2001')
        eq_(contributor.display_name, None)

        def queue_lookup_result():
            # Perform a real lookup against the canned XML and queue
            # its result on the mock client.
            http = self.queue_file_in_mock_http("mindy_kaling.xml")
            lookup = self.client.lookup_by_viaf(viaf="9581122", do_get=http.do_get)
            client.results = [lookup]

        # When lookup is successful, the contributor is updated.
        queue_lookup_result()
        client.process_contributor(contributor)
        eq_(contributor.sort_name, "Kaling, Mindy")
        eq_(contributor.display_name, "Mindy Kaling")

        # If a contributor with the same VIAF number already exists,
        # the original contributor will be updated with VIAF data
        # and the processed contributor will be merged into the original.
        earliest_contributor = contributor
        # Reset the contributors sort name to confirm the data update.
        earliest_contributor.sort_name = None

        # Create a new contributor and contribution to confirm the merge.
        contributor = self._contributor()[0]
        edition = self._edition(authors=contributor.sort_name)
        eq_(edition.contributors, set([contributor]))

        queue_lookup_result()
        client.process_contributor(contributor)
        eq_(earliest_contributor.sort_name, "Kaling, Mindy")
        eq_(edition.contributors, set([earliest_contributor]))
        # The new contributor has been deleted.
        assert contributor not in self._db

        # If the display name of the original contributor is suspiciously
        # different from the VIAF display name, the new contributor will be
        # updated without being merged.
        earliest_contributor.display_name = "Mindy L. Kaling"
        earliest_contributor.sort_name = None
        contributor = self._contributor()[0]
        edition = self._edition(authors=contributor.sort_name)

        queue_lookup_result()
        client.process_contributor(contributor)
        eq_(contributor.viaf, "9581122")
        eq_(contributor.sort_name, "Kaling, Mindy")
        # Earlier contributor has not been updated or merged.
        eq_(earliest_contributor.sort_name, None)
        assert earliest_contributor not in edition.contributors

    def test_lookup_by_viaf(self):
        # there can be one and only one Mindy
        h = self.queue_file_in_mock_http("mindy_kaling.xml")

        contributor_candidate = self.client.lookup_by_viaf(viaf="9581122", do_get=h.do_get)
        (selected_candidate, match_confidences, contributor_titles) = contributor_candidate
        eq_(selected_candidate.viaf, "9581122")
        eq_(selected_candidate.sort_name, "Kaling, Mindy")

    def test_lookup_by_name(self):
        # there can be one and only one Mindy
        h = self.queue_file_in_mock_http("mindy_kaling.xml")

        (selected_candidate,
         match_confidences,
         contributor_titles) = self.client.lookup_by_name(sort_name="Mindy Kaling", do_get=h.do_get)
        eq_(selected_candidate.viaf, "9581122")
        eq_(selected_candidate.sort_name, "Kaling, Mindy")
Ejemplo n.º 17
0
 def setup(self):
     """Create the VIAF client and logger shared by these tests."""
     super(TestVIAFClient, self).setup()
     self.client = VIAFClient(self._db)
     self.log = logging.getLogger("VIAF Client Test")
Ejemplo n.º 18
0
# One-off maintenance script (Python 2): repair Contributor.display_name
# values that look like Wikidata identifiers (e.g. "Q12345") by re-querying
# VIAF for a proper display name.
from pdb import set_trace
import os
import sys
# Make the package root importable when this script is run from its bin/
# directory.
bin_dir = os.path.split(__file__)[0]
package_dir = os.path.join(bin_dir, "..")
sys.path.append(os.path.abspath(package_dir))

import re
from core.model import (
    production_session,
    Contributor,
)
from viaf import VIAFClient

_db = production_session()
viaf_client = VIAFClient(_db)
from sqlalchemy.sql import text
# Select contributors whose display name starts with "Q" followed by a
# digit -- the shape of a Wikidata Q-identifier (Postgres regex match).
contributors = _db.query(Contributor).filter(
    text("contributors.display_name ~ '^Q[0-9]'")).order_by(Contributor.id)
print contributors.count()
for contributor in contributors:
    # Prefer a direct VIAF-id lookup when the id is already known;
    # otherwise fall back to a name-based search.
    if contributor.viaf:
        viaf, display_name, family_name, sort_name, wikipedia_name = viaf_client.lookup_by_viaf(
            contributor.viaf)
    else:
        viaf, display_name, family_name, sort_name, wikipedia_name = viaf_client.lookup_by_name(
            contributor.name)
    # Log the before/after values for each contributor touched.
    print "%s: %s => %s, %s => %s" % (contributor.id, contributor.display_name,
                                      display_name, contributor.wikipedia_name,
                                      wikipedia_name)
    contributor.display_name = display_name
    # NOTE(review): no _db.commit() is visible in this chunk; unless the
    # session autocommits, these updates may never be persisted -- confirm.
Ejemplo n.º 19
0
 def __init__(self, _db, oclcld=None, viaf=None):
     """Set up the canonicalizer with its OCLC-LD and VIAF data sources.

     Either client may be injected (e.g. a mock in tests); otherwise a
     real client bound to `_db` is created.
     """
     self._db = _db
     self.oclcld = oclcld if oclcld else OCLCLinkedData(_db)
     self.viaf = viaf if viaf else VIAFClient(_db)
     self.log = logging.getLogger("Author name canonicalizer")
Ejemplo n.º 20
0
class RedoOCLCForThreeMScript(Script):

    def __init__(self, test_session=None):
        """Set up the coverage providers and VIAF client this script uses.

        :param test_session: optional database session; when given it is
            stored as self._session so tests can run without db session
            overlap with the production session.
        """
        # Allows tests to run without db session overlap.
        if test_session:
            self._session = test_session
        self.coverage = LinkedDataCoverageProvider(self._db)
        self.oclc_classify = OCLCClassifyCoverageProvider(self._db)
        self.viaf = VIAFClient(self._db)

    @property
    def input_data_source(self):
        """The OCLC Linked Data DataSource whose coverage this script redoes."""
        source = DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)
        return source

    def do_run(self):
        """Re-runs OCLC Linked Data coverage provider to get viafs. Fetches
        author information and recalculates presentation."""
        authorless = self.fetch_authorless_threem_identifiers()
        self.delete_coverage_records(authorless)
        self.ensure_isbn_identifier(authorless)
        for identifier in authorless:
            self.coverage.ensure_coverage(identifier)
            self.merge_contributors(identifier)
            # Recalculate everything so the contributors can be seen.
            edition = identifier.primary_edition
            for contributor in edition.contributors:
                self.viaf.process_contributor(contributor)
            edition.calculate_presentation()
            pool = identifier.licensed_through
            if pool:
                pool.calculate_work()

    def fetch_authorless_threem_identifiers(self):
        """Returns a list of ThreeM identifiers that don't have contributors"""
        query = (
            self._db.query(Identifier)
            .join(Identifier.primarily_identifies)
            .outerjoin(Edition.contributions)
            .filter(Contribution.id == None)
            .filter(Identifier.type == Identifier.THREEM_ID)
        )
        return query.all()

    def delete_coverage_records(self, identifiers):
        """Deletes existing OCLC Linked Data coverage records to re-run and
        capture author data"""
        # Work inside a SAVEPOINT so the deletions commit as one unit.
        savepoint = self._db.begin_nested()

        for identifier in identifiers:
            stale_records = [
                record for record in identifier.coverage_records
                if record.data_source == self.input_data_source
            ]
            for record in stale_records:
                self._db.delete(record)

        savepoint.commit()

    def ensure_isbn_identifier(self, identifiers):
        """Runs OCLCClassify to get ISBN numbers if they're not available."""
        # First pass: collect identifiers with no ISBN among their
        # equivalent identifiers.
        missing_isbn = [
            identifier for identifier in identifiers
            if Identifier.ISBN not in [
                eq.output.type for eq in identifier.equivalencies]
        ]

        # Second pass: request classification coverage for just those.
        for identifier in missing_isbn:
            self.oclc_classify.ensure_coverage(identifier)

    def merge_contributors(self, identifier):
        """Gives a ThreeM primary edition any contributors found via OCLC-LD"""
        # Find OCLC Work/Number identifiers equivalent to this identifier.
        equivalents = (
            self._db.query(Identifier)
            .join(Identifier.inbound_equivalencies)
            .filter(or_(
                Identifier.type == Identifier.OCLC_WORK,
                Identifier.type == Identifier.OCLC_NUMBER
            ))
            .filter(Equivalency.input_id == identifier.id)
        )

        # Gather every contribution attached to those OCLC editions.
        contributions = []
        for oclc_identifier in equivalents.all():
            for oclc_edition in oclc_identifier.primarily_identifies:
                contributions += oclc_edition.contributions

        # Copy each contribution onto the ThreeM identifier's editions.
        for contribution in contributions:
            for edition in identifier.primarily_identifies:
                edition.add_contributor(contribution.contributor, contribution.role)