class TestVIAFClient(DatabaseTest):
    """Exercise VIAFClient lookups and contributor processing against
    canned VIAF XML responses served by a dummy HTTP client.
    """

    def setup(self):
        super(TestVIAFClient, self).setup()
        self.log = logging.getLogger("VIAF Client Test")
        self.client = VIAFClient(self._db)

    def sample_data(self, filename):
        # Delegate to the module-level loader, scoped to the "viaf" fixtures.
        return sample_data(filename, "viaf")

    def queue_file_in_mock_http(self, filename):
        # Build a dummy HTTP client whose next response is a 200 carrying
        # the contents of the named sample file as text/xml.
        mock_http = DummyHTTPClient()
        body = self.sample_data(filename)
        mock_http.queue_response(200, media_type='text/xml', content=body)
        return mock_http

    def test_process_contributor(self):
        mock_client = MockVIAFClientLookup(self._db, self.log)

        def queue_mindy_lookup():
            # Run a real lookup against the canned Mindy Kaling document
            # and make its result the mock client's only pending result.
            mock_http = self.queue_file_in_mock_http("mindy_kaling.xml")
            result = self.client.lookup_by_viaf(
                viaf="9581122", do_get=mock_http.do_get
            )
            mock_client.results = [result]

        original = self._contributor()[0]

        # If lookup returns an empty array (as in the case of
        # VIAFParser#parse_multiple), the contributor is not updated.
        mock_client.queue_lookup([])
        mock_client.process_contributor(original)
        eq_(original.sort_name, '2001')
        eq_(original.display_name, None)

        # When lookup is successful, the contributor is updated.
        queue_mindy_lookup()
        mock_client.process_contributor(original)
        eq_(original.sort_name, "Kaling, Mindy")
        eq_(original.display_name, "Mindy Kaling")

        # If a contributor with the same VIAF number already exists,
        # the original contributor will be updated with VIAF data
        # and the processed contributor will be merged into the original.
        #
        # Blank out the original's sort name so the data update is visible.
        original.sort_name = None

        # A second contributor, with a contribution, lets us confirm
        # the merge.
        duplicate = self._contributor()[0]
        edition = self._edition(authors=duplicate.sort_name)
        eq_(edition.contributors, {duplicate})

        queue_mindy_lookup()
        mock_client.process_contributor(duplicate)
        eq_(original.sort_name, "Kaling, Mindy")
        eq_(edition.contributors, {original})
        # The second contributor has been deleted.
        assert duplicate not in self._db

        # If the display name of the original contributor is suspiciously
        # different from the VIAF display name, the new contributor will be
        # updated without being merged.
        original.display_name = "Mindy L. Kaling"
        original.sort_name = None
        distinct = self._contributor()[0]
        edition = self._edition(authors=distinct.sort_name)

        queue_mindy_lookup()
        mock_client.process_contributor(distinct)
        eq_(distinct.viaf, "9581122")
        eq_(distinct.sort_name, "Kaling, Mindy")
        # The earlier contributor has been neither updated nor merged.
        eq_(original.sort_name, None)
        assert original not in edition.contributors

    def test_lookup_by_viaf(self):
        # there can be one and only one Mindy
        mock_http = self.queue_file_in_mock_http("mindy_kaling.xml")

        candidate, _confidences, _titles = self.client.lookup_by_viaf(
            viaf="9581122", do_get=mock_http.do_get
        )
        eq_(candidate.viaf, "9581122")
        eq_(candidate.sort_name, "Kaling, Mindy")

    def test_lookup_by_name(self):
        # there can be one and only one Mindy
        mock_http = self.queue_file_in_mock_http("mindy_kaling.xml")

        lookup = self.client.lookup_by_name(
            sort_name="Mindy Kaling", do_get=mock_http.do_get
        )
        candidate, _confidences, _titles = lookup
        eq_(candidate.viaf, "9581122")
        eq_(candidate.sort_name, "Kaling, Mindy")
Esempio n. 2
0
class TestVIAFClient(DatabaseTest):
    """Tests for VIAFClient: lookups by VIAF id and by name, and the
    update/merge behaviour of process_contributor, all driven by canned
    VIAF XML fixtures.
    """

    def setup(self):
        super(TestVIAFClient, self).setup()
        self.log = logging.getLogger("VIAF Client Test")
        self.client = VIAFClient(self._db)

    def sample_data(self, filename):
        # Load a fixture from the "viaf" sample-data directory.
        return sample_data(filename, "viaf")

    def queue_file_in_mock_http(self, filename):
        # Return a DummyHTTPClient with one queued 200 text/xml response
        # whose content is the named fixture.
        http = DummyHTTPClient()
        payload = self.sample_data(filename)
        http.queue_response(200, media_type='text/xml', content=payload)
        return http

    def test_process_contributor(self):
        lookup_client = MockVIAFClientLookup(self._db, self.log)

        def prime_lookup():
            # Resolve the canned Mindy Kaling record with the real client
            # and install it as the mock client's sole result.
            http = self.queue_file_in_mock_http("mindy_kaling.xml")
            found = self.client.lookup_by_viaf(viaf="9581122",
                                               do_get=http.do_get)
            lookup_client.results = [found]

        earliest = self._contributor()[0]

        # If lookup returns an empty array (as in the case of
        # VIAFParser#parse_multiple), the contributor is not updated.
        lookup_client.queue_lookup([])
        lookup_client.process_contributor(earliest)
        eq_(earliest.sort_name, '2001')
        eq_(earliest.display_name, None)

        # When lookup is successful, the contributor is updated.
        prime_lookup()
        lookup_client.process_contributor(earliest)
        eq_(earliest.sort_name, "Kaling, Mindy")
        eq_(earliest.display_name, "Mindy Kaling")

        # If a contributor with the same VIAF number already exists,
        # the original contributor will be updated with VIAF data
        # and the processed contributor will be merged into the original.
        #
        # Reset the earliest contributor's sort name to confirm the
        # data update.
        earliest.sort_name = None

        # Create a new contributor and contribution to confirm the merge.
        newcomer = self._contributor()[0]
        edition = self._edition(authors=newcomer.sort_name)
        eq_(edition.contributors, set([newcomer]))

        prime_lookup()
        lookup_client.process_contributor(newcomer)
        eq_(earliest.sort_name, "Kaling, Mindy")
        eq_(edition.contributors, set([earliest]))
        # The new contributor has been deleted.
        assert newcomer not in self._db

        # If the display name of the original contributor is suspiciously
        # different from the VIAF display name, the new contributor will be
        # updated without being merged.
        earliest.display_name = "Mindy L. Kaling"
        earliest.sort_name = None
        newcomer = self._contributor()[0]
        edition = self._edition(authors=newcomer.sort_name)

        prime_lookup()
        lookup_client.process_contributor(newcomer)
        eq_(newcomer.viaf, "9581122")
        eq_(newcomer.sort_name, "Kaling, Mindy")
        # Earlier contributor has not been updated or merged.
        eq_(earliest.sort_name, None)
        assert earliest not in edition.contributors

    def test_lookup_by_viaf(self):
        # there can be one and only one Mindy
        http = self.queue_file_in_mock_http("mindy_kaling.xml")

        selected, _match_confidences, _titles = self.client.lookup_by_viaf(
            viaf="9581122", do_get=http.do_get)
        eq_(selected.viaf, "9581122")
        eq_(selected.sort_name, "Kaling, Mindy")

    def test_lookup_by_name(self):
        # there can be one and only one Mindy
        http = self.queue_file_in_mock_http("mindy_kaling.xml")

        result = self.client.lookup_by_name(sort_name="Mindy Kaling",
                                            do_get=http.do_get)
        selected, _match_confidences, _titles = result
        eq_(selected.viaf, "9581122")
        eq_(selected.sort_name, "Kaling, Mindy")
Esempio n. 3
0
class LinkedDataCoverageProvider(IdentifierCoverageProvider):
    """Runs Editions obtained from OCLC Lookup through OCLC Linked Data.

    This (maybe) associates an edition with a (potentially) large
    number of ISBNs, which can be used as input into other services.
    """

    SERVICE_NAME = "OCLC Linked Data Coverage Provider"
    DEFAULT_BATCH_SIZE = 10

    DATA_SOURCE_NAME = DataSource.OCLC_LINKED_DATA
    INPUT_IDENTIFIER_TYPES = [
        Identifier.OCLC_WORK, Identifier.OCLC_NUMBER, Identifier.ISBN,
        Identifier.OVERDRIVE_ID
    ]

    def __init__(self, _db, *args, **kwargs):
        """Set up the OCLC Linked Data and VIAF clients.

        :param _db: database session, passed to the default clients and
            on to the parent constructor.
        :param api: optional keyword. Used as ``self.api`` when present;
            otherwise ``OCLCLinkedData(_db)`` is created.
        :param viaf_api: optional keyword. Used as ``self.viaf`` when
            present; otherwise ``VIAFClient(_db)`` is created.

        Both keywords are removed from ``kwargs`` before the remaining
        arguments are forwarded to the parent constructor.
        """
        # Test for key presence (rather than pop-with-default) so that an
        # explicitly supplied value is always honoured as given.
        if 'api' in kwargs:
            self.api = kwargs.pop('api')
        else:
            self.api = OCLCLinkedData(_db)
        if 'viaf_api' in kwargs:
            self.viaf = kwargs.pop('viaf_api')
        else:
            self.viaf = VIAFClient(_db)
        super(LinkedDataCoverageProvider, self).__init__(_db, *args, **kwargs)

    def process_item(self, identifier):
        """Apply OCLC Linked Data metadata for one identifier.

        :param identifier: the Identifier to look up.
        :return: ``identifier`` on success, or the result of
            ``self.failure(...)`` when the lookup raised an IOError.
        """
        try:
            new_info_counter = Counter()
            self.log.info("Processing identifier %r", identifier)
            metadatas = list(self.api.info_for(identifier))

            if identifier.type == Identifier.ISBN:
                # Currently info_for seeks the results of OCLC Work IDs only
                # This segment will get the metadata of any equivalent OCLC Numbers
                # as well.
                equivalents = Identifier.recursively_equivalent_identifier_ids(
                    self._db, [identifier.id])
                oclc_numbers = self._db.query(Identifier).\
                    filter(Identifier.id.in_(equivalents)).\
                    filter(Identifier.type==Identifier.OCLC_NUMBER).all()
                for oclc_number in oclc_numbers:
                    metadatas.extend(self.api.info_for(oclc_number))

            # Discard empty results once, for every identifier type, so
            # the loop below never dereferences a falsy metadata object.
            metadatas = [m for m in metadatas if m]

            for metadata in metadatas:
                other_identifier, ignore = metadata.primary_identifier.load(
                    self._db)
                oclc_editions = other_identifier.primarily_identifies

                # Keep track of the number of editions OCLC associates
                # with this identifier.
                other_identifier.add_measurement(
                    self.data_source, Measurement.PUBLISHED_EDITIONS,
                    len(oclc_editions))

                # Clean up contributor information.
                self.apply_viaf_to_contributor_data(metadata)
                # Remove any empty ContributorData objects that may have
                # been created. A real list (not a lazy filter object) is
                # stored because the contributors are iterated again later.
                metadata.contributors = [
                    c for c in metadata.contributors
                    if c.sort_name or c.display_name
                ]

                # When metadata is applied, it must be given a client that can
                # respond to 'canonicalize_author_name'. Usually this is an
                # OPDSImporter that reaches out to the Metadata Wrangler, but
                # in the case of being _on_ the Metadata Wrangler...:
                from canonicalize import AuthorNameCanonicalizer
                metadata_client = AuthorNameCanonicalizer(self._db,
                                                          oclcld=self.api,
                                                          viaf=self.viaf)

                num_new_isbns = self.new_isbns(metadata)
                new_info_counter['isbns'] += num_new_isbns
                if oclc_editions:
                    # There are existing OCLC editions. Apply any new information to them.
                    for edition in oclc_editions:
                        metadata, new_info_counter = self.apply_metadata_to_edition(
                            edition, metadata, metadata_client,
                            new_info_counter)
                else:
                    # Create a new OCLC edition to hold the information.
                    edition, ignore = get_one_or_create(
                        self._db,
                        Edition,
                        data_source=self.data_source,
                        primary_identifier=other_identifier)
                    metadata, new_info_counter = self.apply_metadata_to_edition(
                        edition, metadata, metadata_client, new_info_counter)
                    # Set the new OCLC edition's identifier equivalent to this
                    # identifier so we know they're related.
                    self.set_equivalence(identifier, metadata)

                self.log.info(
                    "Total: %(editions)d editions, %(isbns)d ISBNs, "\
                    "%(descriptions)d descriptions, %(subjects)d classifications.",
                    new_info_counter
                )
        except IOError as e:
            # Prefer the legacy ``message`` attribute when it is present and
            # non-empty; otherwise fall back to the string form, which is
            # also what is available on interpreters without ``message``.
            error_text = getattr(e, 'message', None) or str(e)
            if ", but couldn't find location" in error_text:
                exception = "OCLC doesn't know about this ISBN: %r" % e
                transient = False
            else:
                exception = "OCLC raised an error: %r" % e
                transient = True
            return self.failure(identifier, exception, transient=transient)
        return identifier

    def apply_viaf_to_contributor_data(self, metadata):
        """Looks up VIAF information for contributors identified by OCLC

        This is particularly crucial for contributors identified solely
        by VIAF IDs (and no sort_name), as it raises errors later in the
        process.

        :param metadata: object whose ``contributors`` are updated in
            place with whatever the VIAF lookup returns.
        """
        for contributor_data in metadata.contributors:
            if not contributor_data.viaf:
                continue
            lookup = self.viaf.lookup_by_viaf(
                contributor_data.viaf,
                working_sort_name=contributor_data.sort_name,
                working_display_name=contributor_data.display_name)
            # Skip an empty lookup result instead of indexing into it.
            if not lookup:
                continue
            viaf_contributor_data = lookup[0]
            if viaf_contributor_data:
                viaf_contributor_data.apply(contributor_data)

    def apply_metadata_to_edition(self, edition, metadata, metadata_client,
                                  counter):
        """Applies metadata and increments counters

        :param edition: the edition the metadata is applied to.
        :param metadata: the metadata object to apply.
        :param metadata_client: passed through to ``metadata.apply``.
        :param counter: mapping whose 'editions', 'descriptions' and
            'subjects' entries are incremented in place.
        :return: the ``(metadata, counter)`` pair that was passed in.
        """
        metadata.apply(edition,
                       collection=None,
                       metadata_client=metadata_client)
        counter['editions'] += 1
        counter['descriptions'] += len(metadata.links)
        counter['subjects'] += len(metadata.subjects)

        return metadata, counter

    def new_isbns(self, metadata):
        """Returns the number of new isbns on a metadata object

        Every entry in ``metadata.identifiers`` is loaded against the
        database; those reported as newly created are counted.
        """
        new_isbns = 0
        for identifier_data in metadata.identifiers:
            identifier, new = identifier_data.load(self._db)
            if new:
                new_isbns += 1
        return new_isbns

    def set_equivalence(self, identifier, metadata):
        """Identify the OCLC Number with the OCLC Work

        The equivalence strength is the best score over the editions the
        identifier primarily identifies, where each score weights title
        similarity at 0.8 and VIAF overlap at 0.2. When there are no such
        editions the strength is 1. No equivalence is recorded when the
        resulting strength is not positive.
        """
        primary_editions = identifier.primarily_identifies
        if primary_editions:
            # These depend only on the metadata, so compute them once
            # rather than once per edition.
            has_title = bool(metadata.title)
            metadata_viafs = set(
                [c.viaf for c in metadata.contributors if c.viaf])

            strength = 0
            for primary_edition in primary_editions:
                if has_title:
                    title_strength = MetadataSimilarity.title_similarity(
                        metadata.title, primary_edition.title)
                else:
                    title_strength = 0
                edition_viafs = set(
                    [c.viaf for c in primary_edition.contributors if c.viaf])
                author_strength = MetadataSimilarity._proportion(
                    edition_viafs, metadata_viafs)
                edition_strength = (title_strength * 0.8) + (author_strength *
                                                             0.2)
                if edition_strength > strength:
                    strength = edition_strength
        else:
            strength = 1

        if strength > 0:
            primary_identifier, ignore = metadata.primary_identifier.load(
                self._db)
            identifier.equivalent_to(self.data_source, primary_identifier,
                                     strength)
Esempio n. 4
0
import re
from core.model import (
    production_session,
    Contributor,
)
from viaf import VIAFClient

# One-off maintenance script: refresh name fields from VIAF for every
# contributor whose display name matches '^Q[0-9]', then recalculate the
# presentation of each affected work.
# NOTE(review): the 'Q<digit>' names are presumably leaked Wikidata-style
# ids -- confirm. Nothing here commits the session explicitly; verify that
# the session or calculate_presentation() persists the changes.
_db = production_session()
viaf_client = VIAFClient(_db)
from sqlalchemy.sql import text
contributors = _db.query(Contributor).filter(
    text("contributors.display_name ~ '^Q[0-9]'")).order_by(Contributor.id)
print(contributors.count())
for contributor in contributors:
    # Prefer the exact VIAF id when there is one; otherwise search by name.
    if contributor.viaf:
        lookup = viaf_client.lookup_by_viaf(contributor.viaf)
    else:
        lookup = viaf_client.lookup_by_name(contributor.name)
    viaf, display_name, family_name, sort_name, wikipedia_name = lookup
    print("%s: %s => %s, %s => %s" % (contributor.id, contributor.display_name,
                                      display_name, contributor.wikipedia_name,
                                      wikipedia_name))
    contributor.display_name = display_name
    contributor.wikipedia_name = wikipedia_name
    contributor.family_name = family_name
    # Works are recalculated so they pick up the corrected names.
    for contribution in contributor.contributions:
        edition = contribution.edition
        if edition.work:
            edition.work.calculate_presentation()
from core.model import (
    production_session, 
    Contributor, 
)
from viaf import VIAFClient

# One-off maintenance script: for each contributor whose display name
# matches '^Q[0-9]', fetch fresh name data from VIAF, overwrite the
# display, Wikipedia and family names, and recalculate the presentation of
# every work the contributor is attached to.
# NOTE(review): the 'Q<digit>' names are presumably leaked Wikidata-style
# ids -- confirm. Nothing here commits the session explicitly; verify that
# the session or calculate_presentation() persists the changes.
_db = production_session()
viaf_client = VIAFClient(_db)
from sqlalchemy.sql import text
contributors = _db.query(Contributor).filter(
    text("contributors.display_name ~ '^Q[0-9]'")
).order_by(Contributor.id)
print(contributors.count())
for contributor in contributors:
    # Look up by VIAF id when one is recorded, by name otherwise.
    if contributor.viaf:
        lookup = viaf_client.lookup_by_viaf(contributor.viaf)
    else:
        lookup = viaf_client.lookup_by_name(contributor.name)
    viaf, display_name, family_name, sort_name, wikipedia_name = lookup
    print("%s: %s => %s, %s => %s" % (
        contributor.id,
        contributor.display_name, display_name,
        contributor.wikipedia_name, wikipedia_name
    ))
    contributor.display_name = display_name
    contributor.wikipedia_name = wikipedia_name
    contributor.family_name = family_name
    # Works are recalculated so they pick up the corrected names.
    for contribution in contributor.contributions:
        edition = contribution.edition
        if edition.work:
            edition.work.calculate_presentation()