def process_work(self, work):
    primary_identifier_ids = [
        x.primary_identifier.id for x in work.editions
    ]
    data = Identifier.recursively_equivalent_identifier_ids(
        self._db, primary_identifier_ids, 5, threshold=0.5
    )
    flattened_data = Identifier.flatten_identifier_ids(data)
    workgenres, work.fiction, work.audience, target_age = work.assign_genres(
        flattened_data
    )
    old_target_age = work.target_age
    work.target_age = NumericRange(*target_age)
    if work.target_age != old_target_age and work.target_age.lower is not None:
        print "%r: %r->%r" % (work.title, old_target_age, work.target_age)
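The snippet relies on NumericRange equality and on its .lower attribute. A minimal sketch of that comparison, assuming NumericRange is psycopg2.extras.NumericRange (the import is not shown in this excerpt):

from psycopg2.extras import NumericRange

old_target_age = NumericRange(9, 12, '[]')   # ages 9-12, inclusive bounds
new_target_age = NumericRange(10, 12, '[]')

# Equality is value-based, and .lower is None for an unbounded range,
# which is why the snippet checks it before logging a change.
if new_target_age != old_target_age and new_target_age.lower is not None:
    print("%r -> %r" % (old_target_age, new_target_age))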
def test_confirm_same_identifier(self):
    source = DataSource.lookup(self._db, DataSource.NOVELIST)
    identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, '84752928'
    )
    unmatched_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, '23781947'
    )
    metadata = Metadata(source, primary_identifier=identifier)
    match = Metadata(source, primary_identifier=identifier)
    mistake = Metadata(source, primary_identifier=unmatched_identifier)

    eq_(False, self.novelist._confirm_same_identifier([metadata, mistake]))
    eq_(True, self.novelist._confirm_same_identifier([metadata, match]))
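The test above exercises _confirm_same_identifier only through the NoveList API object. Its apparent contract -- every Metadata object in a batch must share one primary identifier -- can be sketched without a database; the helper below is a hypothetical stand-in, not the NoveList implementation:

def confirm_same_identifier(metadata_objects):
    # True only if every object in the batch has the same primary identifier.
    return len(set(m.primary_identifier for m in metadata_objects)) == 1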
def page(cls, _db, title, url, annotator=None, use_materialized_works=True):
    """Create a feed of content to preload on devices."""
    configured_content = Configuration.policy(Configuration.PRELOADED_CONTENT)

    identifiers = [Identifier.parse_urn(_db, urn)[0] for urn in configured_content]
    identifier_ids = [identifier.id for identifier in identifiers]

    if use_materialized_works:
        from core.model import MaterializedWork
        q = _db.query(MaterializedWork)
        q = q.filter(MaterializedWork.primary_identifier_id.in_(identifier_ids))

        # Avoid eager loading of objects that are contained in the
        # materialized view.
        q = q.options(
            lazyload(MaterializedWork.license_pool, LicensePool.data_source),
            lazyload(MaterializedWork.license_pool, LicensePool.identifier),
            lazyload(MaterializedWork.license_pool, LicensePool.edition),
        )
    else:
        q = _db.query(Work).join(Work.primary_edition)
        q = q.filter(Edition.primary_identifier_id.in_(identifier_ids))

    works = q.all()
    feed = cls(_db, title, url, works, annotator)

    annotator.annotate_feed(feed, None)
    content = unicode(feed)
    return content
def get_identifiers(self, url=None):
    """Pulls mapped identifiers from a feed of SimplifiedOPDSMessages."""
    response = self.get_response(url=url)
    feed = response.text

    etree_feed = etree.parse(StringIO(response.text))
    messages = self.importer.extract_messages(self.parser, etree_feed)

    urns = [m.urn for m in messages]
    identifiers_by_urn, _failures = Identifier.parse_urns(
        self._db, urns, autocreate=False
    )
    urns = identifiers_by_urn.keys()
    identifiers = identifiers_by_urn.values()
    self.importer.build_identifier_mapping(urns)

    mapped_identifiers = list()
    for identifier in identifiers:
        mapped_identifier = self.importer.identifier_mapping.get(
            identifier, identifier
        )
        mapped_identifiers.append(mapped_identifier)

    parsed_feed = feedparser.parse(feed)
    next_links = self.importer.extract_next_links(parsed_feed)
    return mapped_identifiers, next_links
def process_urn(self, urn, collection=None, **kwargs):
    """Turn a URN into a Work suitable for use in an OPDS feed."""
    try:
        identifier, is_new = Identifier.parse_urn(self._db, urn)
    except ValueError as e:
        identifier = None
def oclc_number_for_isbn(self, isbn):
    """Turn an ISBN identifier into an OCLC Number identifier."""
    # Let's pretend any id can be an oclc id.
    oclc_number = isbn.identifier
    oclc_identifier, made_new = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, oclc_number, autocreate=True)
    return oclc_identifier
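Both implementations of oclc_number_for_isbn in this collection depend on Identifier.for_foreign_id returning an (identifier, is_new) pair. A dictionary-backed sketch of that get-or-create convention, purely illustrative rather than the SQLAlchemy-backed original:

_registry = {}

def for_foreign_id(id_type, foreign_id, autocreate=True):
    # Return (identifier, is_new), mirroring the convention used above.
    key = (id_type, foreign_id)
    if key in _registry:
        return _registry[key], False
    if not autocreate:
        return None, False
    _registry[key] = {"type": id_type, "identifier": foreign_id}
    return _registry[key], True

identifier, made_new = for_foreign_id("OCLC Number", "1862341597")
assert made_new is True
identifier, made_new = for_foreign_id("OCLC Number", "1862341597")
assert made_new is False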
def test_run_once(self):
    # Setup authentication and Metadata Wrangler details.
    lp = self._licensepool(
        None, data_source_name=DataSource.BIBLIOTHECA,
        collection=self.collection
    )
    lp.identifier.type = Identifier.BIBLIOTHECA_ID
    isbn = Identifier.parse_urn(self._db, u'urn:isbn:9781594632556')[0]
    lp.identifier.equivalent_to(
        DataSource.lookup(self._db, DataSource.BIBLIOTHECA), isbn, 1
    )
    eq_([], lp.identifier.links)
    eq_([], lp.identifier.measurements)

    # Queue some data to be found.
    responses = (
        'metadata_updates_response.opds',
        'metadata_updates_empty_response.opds',
    )
    for filename in responses:
        data = sample_data(filename, 'opds')
        self.lookup.queue_response(
            200, {'content-type' : OPDSFeed.ACQUISITION_FEED_TYPE}, data
        )

    timestamp = self.ts
    new_timestamp = self.monitor.run_once(timestamp)

    # We have a new value to use for the Monitor's timestamp -- the
    # earliest date seen in the last OPDS feed that contained
    # any entries.
    eq_(datetime.datetime(2016, 9, 20, 19, 37, 2), new_timestamp.finish)
    eq_("Editions processed: 1", new_timestamp.achievements)

    # Normally run_once() doesn't update the monitor's timestamp,
    # but this implementation does, so that work isn't redone if
    # run_once() crashes or the monitor is killed.
    eq_(new_timestamp.finish, self.monitor.timestamp().finish)

    # The original Identifier has information from the
    # mock Metadata Wrangler.
    mw_source = DataSource.lookup(self._db, DataSource.METADATA_WRANGLER)
    eq_(3, len(lp.identifier.links))
    [quality] = lp.identifier.measurements
    eq_(mw_source, quality.data_source)

    # Check the URLs we processed.
    url1, url2 = [x[0] for x in self.lookup.requests]

    # The first URL processed was the default one for the
    # MetadataWranglerOPDSLookup.
    eq_(self.lookup.get_collection_url(self.lookup.UPDATES_ENDPOINT), url1)

    # The second URL processed was whatever we saw in the 'next' link.
    eq_("http://next-link/", url2)
def remove_items(self, collection_details):
    """Removes identifiers from a Collection's catalog"""
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client

    collection = collection_from_details(
        self._db, client, collection_details
    )

    urns = request.args.getlist('urn')
    messages = []
    identifiers_by_urn, failures = Identifier.parse_urns(self._db, urns)
    for urn in failures:
        message = OPDSMessage(
            urn, INVALID_URN.status_code, INVALID_URN.detail
        )
        messages.append(message)

    # Find the IDs of the subset of provided identifiers that are
    # in the catalog, so we know which ones to delete and give a
    # 200 message. Also get a SQLAlchemy clause that selects only
    # those IDs.
    matching_ids, identifier_match_clause = self._in_catalog_subset(
        collection, identifiers_by_urn
    )

    # Use that clause to delete all of the relevant catalog
    # entries.
    delete_stmt = collections_identifiers.delete().where(
        identifier_match_clause
    )
    self._db.execute(delete_stmt)

    # IDs that matched get a 200 message; all others get a 404
    # message.
    for urn, identifier in identifiers_by_urn.items():
        if identifier.id in matching_ids:
            status = HTTP_OK
            description = "Successfully removed"
        else:
            status = HTTP_NOT_FOUND
            description = "Not in catalog"
        message = OPDSMessage(urn, status, description)
        messages.append(message)

    title = "%s Catalog Item Removal for %s" % (collection.protocol, client.url)
    url = self.collection_feed_url("remove", collection, urn=urns)
    removal_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages
    )
    return feed_response(removal_feed)
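remove_items issues one SQLAlchemy Core DELETE instead of loading and deleting ORM objects one at a time. A self-contained sketch of that pattern against an in-memory SQLite table (the table and column names are illustrative, not the real schema):

from sqlalchemy import Column, Integer, MetaData, Table, create_engine

engine = create_engine("sqlite://")
metadata = MetaData()
catalog = Table(
    "collections_identifiers", metadata,
    Column("collection_id", Integer),
    Column("identifier_id", Integer),
)
metadata.create_all(engine)

with engine.connect() as conn:
    conn.execute(catalog.insert(), [
        {"collection_id": 1, "identifier_id": 10},
        {"collection_id": 1, "identifier_id": 11},
    ])
    # A single DELETE removes every matching row; no per-row round trips.
    match_clause = catalog.c.identifier_id.in_([10, 11])
    conn.execute(catalog.delete().where(match_clause))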
def parse_identifier(self, urn):
    """Try to parse a URN into an identifier.

    :return: An Identifier if possible; otherwise None.
    """
    if not urn:
        return None
    try:
        identifier, is_new = Identifier.parse_urn(self._db, urn, False)
    except ValueError as e:
        # The identifier is parseable but invalid, e.g. an
        # ASIN used as an ISBN. Ignore it.
        return None
    return identifier
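parse_urn raises ValueError for a URN that is well-formed but carries an invalid identifier, such as a bad ISBN. For reference, the ISBN-13 check-digit math involved in that kind of validation (a standalone sketch, not the library's validator):

def is_valid_isbn13(isbn):
    # ISBN-13 checksum: digits weighted 1,3,1,3,... must sum to 0 mod 10.
    digits = [int(c) for c in isbn if c.isdigit()]
    if len(digits) != 13:
        return False
    total = sum(d * (3 if i % 2 else 1) for i, d in enumerate(digits))
    return total % 10 == 0

assert is_valid_isbn13("9780804171335")      # the ISBN used in tests below
assert not is_valid_isbn13("9780804171336")  # bad check digit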
def add_items(self, collection_details):
    """Adds identifiers to a Collection's catalog"""
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client

    collection = collection_from_details(
        self._db, client, collection_details
    )

    urns = request.args.getlist('urn')
    messages = []
    identifiers_by_urn, failures = Identifier.parse_urns(self._db, urns)
    for urn in failures:
        message = OPDSMessage(
            urn, INVALID_URN.status_code, INVALID_URN.detail
        )
        messages.append(message)

    # Find the subset of incoming identifiers that are already
    # in the catalog.
    already_in_catalog, ignore = self._in_catalog_subset(
        collection, identifiers_by_urn
    )

    # Everything else needs to be added to the catalog.
    needs_to_be_added = [
        x for x in identifiers_by_urn.values()
        if x.id not in already_in_catalog
    ]
    collection.catalog_identifiers(needs_to_be_added)

    for urn, identifier in identifiers_by_urn.items():
        if identifier.id in already_in_catalog:
            status = HTTP_OK
            description = "Already in catalog"
        else:
            status = HTTP_CREATED
            description = "Successfully added"
        messages.append(OPDSMessage(urn, status, description))

    title = "%s Catalog Item Additions for %s" % (collection.protocol, client.url)
    url = self.collection_feed_url('add', collection, urn=urns)
    addition_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages
    )
    return feed_response(addition_feed)
def handle_event(self, threem_id, isbn, foreign_patron_id,
                 start_time, end_time, internal_event_type):
    # Find or lookup the LicensePool for this event.
    license_pool, is_new = LicensePool.for_foreign_id(
        self._db, self.api.source, Identifier.THREEM_ID, threem_id)

    if is_new:
        # Immediately acquire bibliographic coverage for this book.
        # This will set the DistributionMechanisms and make the
        # book presentation-ready. However, its circulation information
        # might not be up to date until we process some more events.
        record = self.bibliographic_coverage_provider.ensure_coverage(
            license_pool.identifier, force=True
        )

    threem_identifier = license_pool.identifier
    isbn, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, isbn)

    edition, ignore = Edition.for_foreign_id(
        self._db, self.api.source, Identifier.THREEM_ID, threem_id)

    # The ISBN and the 3M identifier are exactly equivalent.
    threem_identifier.equivalent_to(self.api.source, isbn, strength=1)

    # Log the event.
    event, was_new = get_one_or_create(
        self._db, CirculationEvent,
        license_pool=license_pool,
        type=internal_event_type,
        start=start_time,
        foreign_patron_id=foreign_patron_id,
        create_method_kwargs=dict(delta=1, end=end_time)
    )

    # If this is our first time seeing this LicensePool, log its
    # occurrence as a separate event.
    if is_new:
        event, ignore = get_one_or_create(
            self._db, CirculationEvent,
            type=CirculationEvent.TITLE_ADD,
            license_pool=license_pool,
            create_method_kwargs=dict(
                start=license_pool.last_checked or start_time,
                delta=1,
                end=license_pool.last_checked or end_time,
            )
        )

    title = edition.title or "[no title]"
    self.log.info("%r %s: %s", start_time, title, internal_event_type)
    return start_time
def cover_links(cls, work):
    """The content server sends out _all_ cover links for the work.

    For books covered by Gutenberg Illustrated, this can be over
    a hundred cover links.
    """
    _db = Session.object_session(work)
    ids = work.all_identifier_ids()
    image_resources = Identifier.resources_for_identifier_ids(
        _db, ids, Resource.IMAGE)
    thumbnails = []
    full = []
    for cover in image_resources:
        if cover.mirrored_path:
            full.append(cover.mirrored_path)
        if cover.scaled_path:
            thumbnails.append(cover.scaled_path)
    return thumbnails, full
def oclc_number_for_isbn(self, isbn):
    """Turn an ISBN identifier into an OCLC Number identifier."""
    url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
    representation, cached = Representation.get(
        self._db, url, Representation.http_get_no_redirect)
    if not representation.location:
        raise IOError(
            "Expected %s to redirect, but couldn't find location." % url
        )

    location = representation.location
    match = self.URI_WITH_OCLC_NUMBER.match(location)
    if not match:
        raise IOError(
            "OCLC redirected ISBN lookup, but I couldn't make sense "
            "of the destination, %s" % location)
    oclc_number = match.groups()[0]
    return Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, oclc_number)[0]
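URI_WITH_OCLC_NUMBER itself is not shown in this collection. A plausible stand-in that would satisfy the match above -- the exact pattern is an assumption -- along with a quick check:

import re

# Hypothetical version of the class's URI_WITH_OCLC_NUMBER pattern.
URI_WITH_OCLC_NUMBER = re.compile(r"^https?://(?:www\.)?worldcat\.org/oclc/(\d+)")

location = "http://www.worldcat.org/oclc/1862341597"
match = URI_WITH_OCLC_NUMBER.match(location)
assert match is not None
assert match.groups()[0] == "1862341597"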
def handle_import_messages(self, messages_by_id):
    """Turn import messages from the OPDS importer into CoverageFailure
    objects.
    """
    for identifier, message in messages_by_id.items():
        # If the message indicates success but we didn't actually
        # get the data, treat it as a transient error.
        #
        # If the message does not indicate success, create a
        # CoverageRecord with the error so we stop trying this
        # book.
        if not message.success:
            exception = str(message.status_code)
            if message.message:
                exception += ": %s" % message.message
            transient = message.transient
            identifier_obj, ignore = Identifier.parse_urn(self._db, identifier)
            yield CoverageFailure(self, identifier_obj, exception, transient)
def process_item(self, work):
    try:
        content_item = self.content_item_from_work(work)
        result = self.api.create_content_item(content_item)
    except Exception as e:
        return CoverageFailure(
            work, str(e), data_source=self.data_source, transient=True
        )

    content_item_id = result.get('contentItemId')
    bibblio_identifier, _is_new = Identifier.for_foreign_id(
        self._db, Identifier.BIBBLIO_CONTENT_ITEM_ID, content_item_id
    )

    identifier = work.presentation_edition.primary_identifier
    identifier.equivalent_to(self.data_source, bibblio_identifier, 1)

    return work
def test_process_urn_isbn(self):
    # Create a new ISBN identifier.
    # Ask online providers for metadata to turn into an OPDS feed about
    # this identifier.
    # Make sure a coverage record was created, and a 201 status obtained
    # from the provider.
    # Ask the online provider again, and make sure we're now getting a
    # 202 "working on it" status.
    # Ask again, this time getting a result. Make sure we know we got
    # a result.

    isbn, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, self._isbn
    )

    # The first time we look up an ISBN a CoverageRecord is created
    # representing the work to be done.
    self.controller.process_urn(isbn.urn)
    self.assert_one_message(
        isbn.urn, HTTP_CREATED, self.controller.IDENTIFIER_REGISTERED
    )

    [record] = isbn.coverage_records
    eq_(record.exception, self.controller.NO_WORK_DONE_EXCEPTION)
    eq_(record.status, CoverageRecord.TRANSIENT_FAILURE)

    # So long as the necessary coverage is not provided,
    # future lookups will not provide useful information.
    self.controller.precomposed_entries = []
    self.controller.process_urn(isbn.urn)
    self.assert_one_message(
        isbn.urn, HTTP_ACCEPTED, self.controller.WORKING_TO_RESOLVE_IDENTIFIER
    )

    # Let's provide the coverage.
    metadata_sources = DataSource.metadata_sources_for(
        self._db, isbn
    )
    for source in metadata_sources:
        CoverageRecord.add_for(isbn, source)

    # Process the ISBN again, and we get an <entry> tag with the
    # information.
    self.controller.precomposed_entries = []
    self.controller.process_urn(isbn.urn)
    expect = isbn.opds_entry()
    [actual] = self.controller.precomposed_entries
    eq_(etree.tostring(expect), etree.tostring(actual))
def lookup(self, identifier_or_uri, processed_uris=set()):
    """Perform an OCLC Open Data lookup for the given identifier."""
    type = None
    identifier = None
    if isinstance(identifier_or_uri, basestring):
        # e.g. http://experiment.worldcat.org/oclc/1862341597.json
        match = self.URI_WITH_OCLC_NUMBER.search(identifier_or_uri)
        if match:
            type = Identifier.OCLC_NUMBER
            id = match.groups()[0]
        if not type or not id:
            return None, None
        identifier, is_new = Identifier.for_foreign_id(
            self._db, type, id)
    else:
        identifier = identifier_or_uri
        type = identifier.type
    if not type or not identifier:
        return None, None
    return self.lookup_by_identifier(identifier, processed_uris)
def remove_items(self):
    collection = self.authenticated_collection_from_request()
    if isinstance(collection, ProblemDetail):
        return collection

    urns = request.args.getlist('urn')
    messages = []
    for urn in urns:
        message = None
        identifier = None
        try:
            identifier, ignore = Identifier.parse_urn(self._db, urn)
        except Exception as e:
            identifier = None

        if not identifier:
            message = OPDSMessage(
                urn, INVALID_URN.status_code, INVALID_URN.detail
            )
        else:
            if identifier in collection.catalog:
                collection.catalog.remove(identifier)
                message = OPDSMessage(
                    urn, HTTP_OK, "Successfully removed"
                )
            else:
                message = OPDSMessage(
                    urn, HTTP_NOT_FOUND, "Not in collection catalog"
                )
        if message:
            messages.append(message)

    title = "%s Catalog Item Removal" % collection.name
    url = cdn_url_for("remove", urn=urns)
    removal_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages
    )
    return feed_response(removal_feed)
def _process_batch(self, client_method, success_codes, batch):
    results = list()
    id_mapping = self.create_identifier_mapping(batch)
    mapped_batch = id_mapping.keys()

    try:
        response = client_method(mapped_batch)
        self.lookup_client.check_content_type(response)
    except RemoteIntegrationException as e:
        return [
            self.failure(id_mapping[obj], e.debug_message)
            for obj in mapped_batch
        ]

    for message in self.process_feed_response(response, id_mapping):
        try:
            identifier, _new = Identifier.parse_urn(self._db, message.urn)
            mapped_batch.remove(identifier)
        except ValueError as e:
            # For some reason this URN can't be parsed. This
            # shouldn't happen.
            continue

        if message.status_code in success_codes:
            result = id_mapping[identifier]
            results.append(result)
        elif message.status_code == 400:
            # The URN couldn't be recognized. (This shouldn't happen,
            # since if we can parse it here, we can parse it on MW, too.)
            exception = "%s: %s" % (message.status_code, message.message)
            failure = self.failure(identifier, exception)
            results.append(failure)
        else:
            exception = "Unknown OPDSMessage status: %s" % message.status_code
            failure = self.failure(identifier, exception)
            results.append(failure)

    return results
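The status-code triage in _process_batch is easier to see in isolation. A stripped-down sketch of the same branching, with a plain dict standing in for an OPDSMessage:

def triage(message, success_codes):
    # Mirrors the branches above: success, unrecognized URN, or unknown status.
    if message["status_code"] in success_codes:
        return "success"
    if message["status_code"] == 400:
        return "unrecognized-urn"
    return "unknown-status"

assert triage({"status_code": 200}, {200, 201}) == "success"
assert triage({"status_code": 400}, {200, 201}) == "unrecognized-urn"
assert triage({"status_code": 500}, {200, 201}) == "unknown-status"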
def test_lookup_info_to_metadata(self):
    # Basic book information is returned.
    identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, "9780804171335"
    )
    bad_character = self.sample_representation("a_bad_character.json")
    metadata = self.novelist.lookup_info_to_metadata(bad_character)

    assert True == isinstance(metadata, Metadata)
    assert Identifier.NOVELIST_ID == metadata.primary_identifier.type
    assert "10392078" == metadata.primary_identifier.identifier
    assert "A bad character" == metadata.title
    assert None == metadata.subtitle
    assert 1 == len(metadata.contributors)
    [contributor] = metadata.contributors
    assert "Kapoor, Deepti" == contributor.sort_name
    assert 4 == len(metadata.identifiers)
    assert 4 == len(metadata.subjects)
    assert 2 == len(metadata.measurements)
    ratings = sorted(metadata.measurements, key=lambda m: m.value)
    assert 2 == ratings[0].value
    assert 3.27 == ratings[1].value
    assert 625 == len(metadata.recommendations)

    # Confirm that Lexile and series data is extracted with a
    # different sample.
    vampire = self.sample_representation("vampire_kisses.json")
    metadata = self.novelist.lookup_info_to_metadata(vampire)

    [lexile] = filter(lambda s: s.type == "Lexile", metadata.subjects)
    assert "630" == lexile.identifier
    assert "Vampire kisses manga" == metadata.series
    # The full title should be selected, since every volume
    # has the same main title: 'Vampire kisses'
    assert "Vampire kisses: blood relatives. Volume 1" == metadata.title
    assert 1 == metadata.series_position
    assert 5 == len(metadata.recommendations)
def oclc_works_for_isbn(self, isbn, processed_uris=set()):
    """Yield every OCLC Work graph for the given ISBN."""
    # Find the OCLC Number for this ISBN.
    oclc_number = self.oclc_number_for_isbn(isbn)

    # Retrieve the OCLC Linked Data document for that OCLC Number.
    oclc_number_data, was_new = self.lookup_by_identifier(
        oclc_number, processed_uris)
    if not oclc_number_data:
        return

    # Look up every work referenced in that document and yield its data.
    graph = OCLCLinkedData.graph(oclc_number_data)
    works = OCLCLinkedData.extract_works(graph)
    for work_uri in works:
        m = self.URI_WITH_OCLC_WORK_ID.match(work_uri)
        if m:
            work_id = m.groups()[0]
            identifier, was_new = Identifier.for_foreign_id(
                self._db, Identifier.OCLC_WORK, work_id)
            oclc_work_data, cached = self.lookup_by_identifier(
                identifier, processed_uris)
            yield oclc_work_data
    which won't be necessary for this migration.
    """

    def __init__(self, collection):
        super(IdentifierResolutionCoverageProvider, self).__init__(
            collection, registered_only=True
        )

try:
    _db = production_session()
    registrar = IdentifierResolutionRegistrar(_db)

    log.info('Finding unresolved identifiers')
    data_source = DataSource.lookup(_db, DataSource.INTERNAL_PROCESSING)
    unresolved_qu = Identifier.missing_coverage_from(
        _db, [], data_source,
        operation=CoverageRecord.RESOLVE_IDENTIFIER_OPERATION,
        count_as_covered=CoverageRecord.SUCCESS
    ).filter(CoverageRecord.id != None)

    log.info('Finding unaffiliated identifiers without a collection')
    unresolved_and_unaffiliated = unresolved_qu.outerjoin(Identifier.collections)\
        .group_by(Identifier.id).having(func.count(Collection.id)==0)\
        .options(lazyload(Identifier.licensed_through)).distinct()

    if unresolved_and_unaffiliated.count() > 1:
        # Use a bulk insert to add them all to the unaffiliated_collection.
        log.info('Giving all unaffiliated identifiers a collection')
        unaffiliated_collection, ignore = MockResolver.unaffiliated_collection(_db)
        _db.execute(
            collections_identifiers.insert(),
            [
# go through patron's checkouts and generate LoanInfo objects,
# with FulfillmentInfo objects included
media_type = item.get('mediaType', 'eBook')
isbn = item.get('isbn', None)
can_renew = item.get('canRenew', None)
title = item.get('title', None)
authors = item.get('authors', None)

# refers to checkout expiration date, not the downloadUrl's
expires = item.get('expiration', None)
if expires:
    expires = datetime.datetime.strptime(
        expires, self.EXPIRATION_DATE_FORMAT).date()

identifier, made_new = Identifier.for_foreign_id(
    self._db, foreign_identifier_type=Identifier.ONECLICK_ID,
    foreign_id=isbn, autocreate=False)

# Note: if OneClick knows about a patron's checked-out item that wasn't
# checked out through us, we ignore it.
if not identifier:
    continue

files = item.get('files', None)
for file in files:
    filename = file.get('filename', None)
    # Assume fileFormat is the same for all files associated with this
    # checkout and use the last one mentioned. Ex: "fileFormat": "EPUB".
    # Note: audiobooks don't list the fileFormat field, just the filename
    # and the mediaType.
    file_format = file.get('fileFormat', None)
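The expiration handling above depends on self.EXPIRATION_DATE_FORMAT, which this excerpt does not define. Assuming a plain ISO date format purely for illustration:

import datetime

# Hypothetical format string; the real EXPIRATION_DATE_FORMAT is not shown here.
EXPIRATION_DATE_FORMAT = "%Y-%m-%d"

expires = "2024-05-01"
if expires:
    expires = datetime.datetime.strptime(expires, EXPIRATION_DATE_FORMAT).date()
assert expires == datetime.date(2024, 5, 1)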
def process_urns(self, urns, collection_details=None, **kwargs):
    """Processes URNs submitted via lookup request.

    An authenticated request can process up to 30 URNs at once,
    but must specify a collection under which to catalog the URNs.
    This is used when initially recording the fact that certain URNs
    are in a collection, to get a baseline set of metadata. Updates
    on the books should be obtained through the CatalogController.

    An unauthenticated request is used for testing. Such a request
    does not have to specify a collection (the "Unaffiliated"
    collection is used), but can only process one URN at a time.

    :return: None or ProblemDetail
    """
    client = authenticated_client_from_request(self._db, required=False)
    if isinstance(client, ProblemDetail):
        return client

    resolve_now = request.args.get('resolve_now', None) is not None
    collection = collection_from_details(
        self._db, client, collection_details
    )

    if client:
        # Authenticated access.
        if not collection:
            return INVALID_INPUT.detailed(_("No collection provided."))
        limit = 30
    else:
        # Anonymous access.
        collection = self.default_collection
        limit = 1

    if resolve_now:
        # You can't force-resolve more than one Identifier at a time.
        limit = 1

    if len(urns) > limit:
        return INVALID_INPUT.detailed(
            _("The maximum number of URNs you can provide at once is %d. (You sent %d)") % (limit, len(urns))
        )

    identifiers_by_urn, failures = Identifier.parse_urns(
        self._db, urns, allowed_types=self.VALID_TYPES
    )
    self.add_urn_failure_messages(failures)

    # Catalog all identifiers.
    collection.catalog_identifiers(identifiers_by_urn.values())

    # Load all coverage records in a single query to speed up the
    # code that reports on the status of Identifiers that aren't
    # ready.
    self.bulk_load_coverage_records(identifiers_by_urn.values())

    resolver = IdentifierResolutionCoverageProvider(
        collection, provide_coverage_immediately=resolve_now,
        **self.coverage_provider_kwargs
    )
    for urn, identifier in identifiers_by_urn.items():
        self.process_identifier(
            identifier, urn, resolver=resolver
        )
def do_run(self, _db):
    identifier = Identifier(type="Keep It", identifier="100")
    _db.add(identifier)
def add_with_metadata(self, collection_details):
    """Adds identifiers with their metadata to a Collection's catalog"""
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client

    collection = collection_from_details(
        self._db, client, collection_details
    )

    data_source = DataSource.lookup(
        self._db, collection.name, autocreate=True
    )

    messages = []

    feed = feedparser.parse(request.data)
    entries = feed.get("entries", [])
    entries_by_urn = { entry.get('id') : entry for entry in entries }

    identifiers_by_urn, invalid_urns = Identifier.parse_urns(
        self._db, entries_by_urn.keys()
    )

    for urn in invalid_urns:
        messages.append(OPDSMessage(
            urn, INVALID_URN.status_code, INVALID_URN.detail
        ))

    for urn, identifier in identifiers_by_urn.items():
        entry = entries_by_urn[urn]
        status = HTTP_OK
        description = "Already in catalog"

        if identifier not in collection.catalog:
            collection.catalog_identifier(identifier)
            status = HTTP_CREATED
            description = "Successfully added"

        message = OPDSMessage(urn, status, description)

        # Get a cover if it exists.
        image_types = set([Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE])
        images = [l for l in entry.get("links", [])
                  if l.get("rel") in image_types]
        links = [LinkData(image.get("rel"), image.get("href"))
                 for image in images]

        # Create an edition to hold the title and author. LicensePool.calculate_work
        # refuses to create a Work when there's no title, and if we have a title, author
        # and language we can attempt to look up the edition in OCLC.
        title = entry.get("title") or "Unknown Title"
        author = ContributorData(
            sort_name=(entry.get("author") or Edition.UNKNOWN_AUTHOR),
            roles=[Contributor.PRIMARY_AUTHOR_ROLE]
        )
        language = entry.get("dcterms_language")

        presentation = PresentationCalculationPolicy(
            choose_edition=False,
            set_edition_metadata=False,
            classify=False,
            choose_summary=False,
            calculate_quality=False,
            choose_cover=False,
            regenerate_opds_entries=False,
        )
        replace = ReplacementPolicy(presentation_calculation_policy=presentation)
        metadata = Metadata(
            data_source,
            primary_identifier=IdentifierData(identifier.type, identifier.identifier),
            title=title,
            language=language,
            contributors=[author],
            links=links,
        )

        edition, ignore = metadata.edition(self._db)
        metadata.apply(edition, collection, replace=replace)

        messages.append(message)

    title = "%s Catalog Item Additions for %s" % (collection.protocol, client.url)
    url = self.collection_feed_url("add_with_metadata", collection)
    addition_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages
    )
    return feed_response(addition_feed)
def lookup_info_to_metadata(self, lookup_representation):
    """Transforms a NoveList JSON representation into a Metadata object"""
    if not lookup_representation.content:
        return None

    lookup_info = json.loads(lookup_representation.content)
    book_info = lookup_info['TitleInfo']
    if book_info:
        novelist_identifier = book_info.get('ui')
    if not book_info or not novelist_identifier:
        # NoveList didn't know the ISBN.
        return None

    primary_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, novelist_identifier)
    metadata = Metadata(self.source, primary_identifier=primary_identifier)

    # Get the equivalent ISBN identifiers.
    metadata.identifiers += self._extract_isbns(book_info)

    author = book_info.get('author')
    if author:
        metadata.contributors.append(ContributorData(sort_name=author))

    description = book_info.get('description')
    if description:
        metadata.links.append(LinkData(
            rel=Hyperlink.DESCRIPTION, content=description,
            media_type=Representation.TEXT_PLAIN
        ))

    audience_level = book_info.get('audience_level')
    if audience_level:
        metadata.subjects.append(SubjectData(
            Subject.FREEFORM_AUDIENCE, audience_level
        ))

    novelist_rating = book_info.get('rating')
    if novelist_rating:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, novelist_rating
        ))

    # Extract feature content if it is available.
    series_info = None
    appeals_info = None
    lexile_info = None
    goodreads_info = None
    recommendations_info = None
    feature_content = lookup_info.get('FeatureContent')
    if feature_content:
        series_info = feature_content.get('SeriesInfo')
        appeals_info = feature_content.get('Appeals')
        lexile_info = feature_content.get('LexileInfo')
        goodreads_info = feature_content.get('GoodReads')
        recommendations_info = feature_content.get('SimilarTitles')

    metadata, title_key = self.get_series_information(
        metadata, series_info, book_info
    )
    metadata.title = book_info.get(title_key)
    subtitle = TitleProcessor.extract_subtitle(
        metadata.title, book_info.get('full_title')
    )
    metadata.subtitle = self._scrub_subtitle(subtitle)

    # TODO: How well do we trust this data? We could conceivably bump up
    # the weight here.
    if appeals_info:
        extracted_genres = False
        for appeal in appeals_info:
            genres = appeal.get('genres')
            if genres:
                for genre in genres:
                    metadata.subjects.append(SubjectData(
                        Subject.TAG, genre['Name']
                    ))
                    extracted_genres = True
            if extracted_genres:
                break

    if lexile_info:
        metadata.subjects.append(SubjectData(
            Subject.LEXILE_SCORE, lexile_info['Lexile']
        ))

    if goodreads_info:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, goodreads_info['average_rating']
        ))

    metadata = self.get_recommendations(metadata, recommendations_info)

    # If nothing interesting comes from the API, ignore it.
    if not (metadata.measurements or metadata.series_position or
            metadata.series or metadata.subjects or metadata.links or
            metadata.subtitle or metadata.recommendations):
        metadata = None
    return metadata
)
from threem import ThreeMAPI
from overdrive import OverdriveAPI
from axis import Axis360API
from circulation import CirculationAPI
from circulation_exceptions import *

barcode, pin, borrow_urn, hold_urn = sys.argv[1:5]
email = os.environ.get(
    'DEFAULT_NOTIFICATION_EMAIL_ADDRESS', '*****@*****.**')

_db = production_session()
patron, ignore = get_one_or_create(
    _db, Patron, authorization_identifier=barcode)

borrow_identifier = Identifier.parse_urn(_db, borrow_urn, True)[0]
hold_identifier = Identifier.parse_urn(_db, hold_urn, True)[0]
borrow_pool = borrow_identifier.licensed_through
hold_pool = hold_identifier.licensed_through

if any(x.type == Identifier.THREEM_ID
       for x in [borrow_identifier, hold_identifier]):
    threem = ThreeMAPI(_db)
else:
    threem = None

if any(x.type == Identifier.OVERDRIVE_ID
       for x in [borrow_identifier, hold_identifier]):
    overdrive = OverdriveAPI(_db)
else:
    overdrive = None

if any(x.type == Identifier.AXIS_360_ID
       for x in [borrow_identifier, hold_identifier]):
def run(self):
    id_type, identifier = sys.argv[1:]
    identifier, ignore = Identifier.for_foreign_id(
        self._db, id_type, identifier
    )
    self.fix_identifier(identifier)
class OCLCXMLParser(XMLParser):

    # OCLC in-representation 'status codes'
    SINGLE_WORK_SUMMARY_STATUS = 0
    SINGLE_WORK_DETAIL_STATUS = 2
    MULTI_WORK_STATUS = 4
    NO_INPUT_STATUS = 100
    INVALID_INPUT_STATUS = 101
    NOT_FOUND_STATUS = 102
    UNEXPECTED_ERROR_STATUS = 200

    INTS = set([OCLC.HOLDING_COUNT, OCLC.EDITION_COUNT])

    NAMESPACES = {'oclc': 'http://classify.oclc.org'}

    LIST_TYPE = "works"
    log = logging.getLogger("OCLC XML Parser")

    @classmethod
    def parse(cls, _db, xml, **restrictions):
        """Turn XML data from the OCLC lookup service into a list of
        SWIDs (for a multi-work response) or a list of Edition
        objects (for a single-work response).
        """
        tree = etree.fromstring(xml, parser=etree.XMLParser(recover=True))
        response = cls._xpath1(tree, "oclc:response")
        representation_type = int(response.get('code'))

        workset_record = None
        editions = []
        edition_records = []

        if representation_type == cls.UNEXPECTED_ERROR_STATUS:
            raise IOError("Unexpected error from OCLC API: %s" % xml)
        elif representation_type in (
                cls.NO_INPUT_STATUS, cls.INVALID_INPUT_STATUS):
            return representation_type, []
        elif representation_type == cls.SINGLE_WORK_SUMMARY_STATUS:
            raise IOError(
                "Got single-work summary from OCLC despite requesting detail: %s" % xml)

        # The real action happens here.
        if representation_type == cls.SINGLE_WORK_DETAIL_STATUS:
            authors_tag = cls._xpath1(tree, "//oclc:authors")

            work_tag = cls._xpath1(tree, "//oclc:work")
            if work_tag is not None:
                author_string = work_tag.get('author')
                primary_author = cls.primary_author_from_author_string(
                    _db, author_string)

            existing_authors = cls.extract_authors(
                _db, authors_tag, primary_author=primary_author)

            # The representation lists a single work, its authors, its
            # editions, plus summary classification information for the work.
            edition, ignore = cls.extract_edition(
                _db, work_tag, existing_authors, **restrictions)
            if edition:
                cls.log.info("EXTRACTED %r", edition)
            records = []
            if edition:
                records.append(edition)
            else:
                # The work record itself failed one of the
                # restrictions. None of its editions are likely to
                # succeed either.
                return representation_type, records

        elif representation_type == cls.MULTI_WORK_STATUS:
            # The representation lists a set of works that match the
            # search query.
            cls.log.debug("Extracting SWIDs from search results.")
            records = cls.extract_swids(_db, tree, **restrictions)

        elif representation_type == cls.NOT_FOUND_STATUS:
            # No problem; OCLC just doesn't have any data.
            records = []

        else:
            raise IOError("Unrecognized status code from OCLC API: %s (%s)" % (
                representation_type, xml))

        return representation_type, records

    @classmethod
    def extract_swids(cls, _db, tree, **restrictions):
        """Turn a multi-work response into a list of SWIDs."""
        swids = []
        for work_tag in cls._xpath(tree, "//oclc:work"):
            # We're not calling extract_basic_info because we care about
            # the info, we're calling it to make sure this work meets
            # the restriction. If this work meets the restriction,
            # we'll store its info when we look up the SWID.
            response = cls._extract_basic_info(_db, work_tag, **restrictions)
            if response:
                title, author_names, language = response
                # TODO: 'swid' is what it's called in older representations.
                # That code can be removed once we replace all representations.
                work_identifier = work_tag.get('wi') or work_tag.get('swid')
                cls.log.debug(
                    "WORK ID %s (%s, %r, %s)",
                    work_identifier, title, author_names, language)
                swids.append(work_identifier)
        return swids

    ROLES = re.compile("\[([^]]+)\]$")
    LIFESPAN = re.compile("([0-9]+)-([0-9]*)[.;]?$")

    @classmethod
    def extract_authors(cls, _db, authors_tag, primary_author=None):
        results = []
        if authors_tag is not None:
            for author_tag in cls._xpath(authors_tag, "//oclc:author"):
                lc = author_tag.get('lc', None)
                viaf = author_tag.get('viaf', None)
                contributor, roles, default_role_used = cls._parse_single_author(
                    _db, author_tag.text, lc=lc, viaf=viaf,
                    primary_author=primary_author)
                if contributor:
                    results.append(contributor)
        return results

    @classmethod
    def _contributor_match(cls, contributor, name, lc, viaf):
        return (
            contributor.sort_name == name
            and (lc is None or contributor.lc == lc)
            and (viaf is None or contributor.viaf == viaf)
        )

    @classmethod
    def _parse_single_author(cls, _db, author,
                             lc=None, viaf=None,
                             existing_authors=[],
                             default_role=Contributor.AUTHOR_ROLE,
                             primary_author=None):
        default_role_used = False

        # First find roles if present
        # "Giles, Lionel, 1875-1958 [Writer of added commentary; Translator]"
        author = author.strip()
        m = cls.ROLES.search(author)
        if m:
            author = author[:m.start()].strip()
            role_string = m.groups()[0]
            roles = [x.strip() for x in role_string.split(";")]
        elif default_role:
            roles = [default_role]
            default_role_used = True
        else:
            roles = []

        # Author string now looks like
        # "Giles, Lionel, 1875-1958"
        m = cls.LIFESPAN.search(author)
        kwargs = dict()
        if m:
            author = author[:m.start()].strip()
            birth, death = m.groups()
            if birth:
                kwargs[Contributor.BIRTH_DATE] = birth
            if death:
                kwargs[Contributor.DEATH_DATE] = death

        # Author string now looks like
        # "Giles, Lionel,"
        if author.endswith(","):
            author = author[:-1]

        contributor = None
        if not author:
            # No name was given for the author.
            return None, roles, default_role_used

        if primary_author and author == primary_author.sort_name:
            if Contributor.AUTHOR_ROLE in roles:
                roles.remove(Contributor.AUTHOR_ROLE)
            if Contributor.UNKNOWN_ROLE in roles:
                roles.remove(Contributor.UNKNOWN_ROLE)
            roles.insert(0, Contributor.PRIMARY_AUTHOR_ROLE)

        if existing_authors:
            # Calling Contributor.lookup will result in a database
            # hit, and looking up a contributor based on name may
            # result in multiple results (see below). We'll have no
            # way of distinguishing between those results. If
            # possible, it's much more reliable to look through
            # existing_authors (the authors derived from an entry's
            # <authors> tag).
            for x in existing_authors:
                if cls._contributor_match(x, author, lc, viaf):
                    contributor = x
                    break
            if contributor:
                was_new = False

        if not contributor:
            contributor, was_new = Contributor.lookup(
                _db, author, viaf, lc, extra=kwargs)
        if isinstance(contributor, list):
            # We asked for an author based solely on the name, which makes
            # Contributor.lookup() return a list.
            if len(contributor) == 1:
                # Fortunately, either the database knows about only
                # one author with that name, or it didn't know about
                # any authors with that name and it just created one,
                # so we can unambiguously use it.
                contributor = contributor[0]
            else:
                # Uh-oh. The database knows about multiple authors
                # with that name. We have no basis for deciding which
                # author we mean. But we would prefer to identify with
                # an author who has a known LC or VIAF number.
                #
                # This should happen very rarely because of our check
                # against existing_authors above. But it will happen
                # for authors that have a work in Project Gutenberg.
                with_id = [
                    x for x in contributor
                    if x.lc is not None or x.viaf is not None
                ]
                if with_id:
                    contributor = with_id[0]
                else:
                    contributor = contributor[0]
        return contributor, roles, default_role_used

    @classmethod
    def primary_author_from_author_string(cls, _db, author_string):
        # If the first author mentioned in the author string
        # does not have an explicit role set, treat them as the primary
        # author.
        if not author_string:
            return None
        authors = author_string.split("|")
        if not authors:
            return None
        author, roles, default_role_used = cls._parse_single_author(
            _db, authors[0], default_role=Contributor.PRIMARY_AUTHOR_ROLE)
        if roles == [Contributor.PRIMARY_AUTHOR_ROLE]:
            return author
        return None

    @classmethod
    def parse_author_string(cls, _db, author_string, existing_authors=[],
                            primary_author=None):
        default_role = Contributor.PRIMARY_AUTHOR_ROLE
        authors = []
        if not author_string:
            return authors
        for author in author_string.split("|"):
            author, roles, default_role_used = cls._parse_single_author(
                _db, author, existing_authors=existing_authors,
                default_role=default_role,
                primary_author=primary_author)
            if roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    # That was the primary author. If we see someone
                    # with no explicit role after this point, they're
                    # just a regular author.
                    default_role = Contributor.AUTHOR_ROLE
                elif not default_role_used:
                    # We're dealing with someone whose role was
                    # explicitly specified. If we see someone with no
                    # explicit role after this point, it's probably
                    # because their role is so minor as to not be
                    # worth mentioning, not because it's so major that
                    # we can assume they're an author.
                    default_role = Contributor.UNKNOWN_ROLE
            roles = roles or [default_role]
            if author:
                authors.append((author, roles))
        return authors

    @classmethod
    def _extract_basic_info(cls, _db, tag, existing_authors=None,
                            **restrictions):
        """Extract information common to work tag and edition tag."""
        title = tag.get('title')
        author_string = tag.get('author')
        authors_and_roles = cls.parse_author_string(
            _db, author_string, existing_authors)

        if 'language' in tag.keys():
            language = tag.get('language')
        else:
            language = None

        if title and 'title' in restrictions:
            must_resemble_title = restrictions['title']
            threshold = restrictions.get('title_similarity', 0.25)
            similarity = MetadataSimilarity.title_similarity(
                must_resemble_title, title)
            if similarity < threshold:
                # The title of the book under consideration is not
                # similar enough to the given title.
                cls.log.debug(
                    "FAILURE TO RESEMBLE: %s vs %s (%.2f)",
                    title, must_resemble_title, similarity)
                return None

            # The semicolon is frequently used to separate multiple
            # works in an anthology. If there is no semicolon in the
            # original title, do not consider titles that contain
            # semicolons.
            if (not ' ; ' in must_resemble_title
                    and ' ; ' in title and threshold > 0):
                cls.log.debug("SEMICOLON DISQUALIFICATION: %s", title)
                return None

        # Apply restrictions. If they're not met, return None.
        if 'language' in restrictions and language:
            # We know which language this record is for. Match it
            # against the language used in the Edition we're
            # matching against.
            restrict_to_language = set(restrictions['language'])
            if language not in restrict_to_language:
                # This record is for a book in a different language.
                cls.log.debug("WRONG LANGUAGE: %s", language)
                return None

        if 'authors' in restrictions:
            restrict_to_authors = restrictions['authors']
            if restrict_to_authors and isinstance(restrict_to_authors[0], Contributor):
                restrict_to_authors = [x.sort_name for x in restrict_to_authors]
            primary_author = None

            for a, roles in authors_and_roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    primary_author = a
                    break
            if (not primary_author
                or (primary_author not in restrict_to_authors
                    and primary_author.sort_name not in restrict_to_authors)):
                # None of the given authors showed up as the
                # primary author of this book. They may have had
                # some other role in it, or the book may be about
                # them, or incorporate their work, but this book
                # is not *by* them.
                return None

        author_names = ", ".join([x.sort_name for x, y in authors_and_roles])

        return title, authors_and_roles, language

    UNUSED_MEDIA = set([
        "itemtype-intmm",
        "itemtype-msscr",
        "itemtype-artchap-artcl",
        "itemtype-jrnl",
        "itemtype-map",
        "itemtype-vis",
        "itemtype-jrnl-digital",
        "itemtype-image-2d",
        "itemtype-artchap-digital",
        "itemtype-intmm-digital",
        "itemtype-archv",
        "itemtype-msscr-digital",
        "itemtype-game",
        "itemtype-web-digital",
        "itemtype-map-digital",
    ])

    @classmethod
    def extract_edition(cls, _db, work_tag, existing_authors, **restrictions):
        """Create a new Edition object with information about a
        work (identified by OCLC Work ID).
        """
        # TODO: 'pswid' is what it's called in older representations.
        # That code can be removed once we replace all representations.
        oclc_work_id = unicode(work_tag.get('owi') or work_tag.get('pswid'))

        # if oclc_work_id:
        #     print " owi: %s" % oclc_work_id
        # else:
        #     print " No owi in %s" % etree.tostring(work_tag)

        if not oclc_work_id:
            raise ValueError("Work has no owi")

        item_type = work_tag.get("itemtype")
        if (item_type.startswith('itemtype-book')
                or item_type.startswith('itemtype-compfile')):
            medium = Edition.BOOK_MEDIUM
        elif (item_type.startswith('itemtype-audiobook')
              or item_type.startswith('itemtype-music')):
            # Pretty much all Gutenberg texts, even the audio texts,
            # are based on a book, and the ones that aren't
            # (recordings of individual songs) probably aren't in OCLC
            # anyway. So we just want to get the books.
            #medium = Edition.AUDIO_MEDIUM
            medium = None
        elif item_type.startswith('itemtype-video'):
            #medium = Edition.VIDEO_MEDIUM
            medium = None
        elif item_type in cls.UNUSED_MEDIA:
            medium = None
        else:
            medium = None

        # Only create Editions for books with a recognized medium.
        if medium is None:
            return None, False

        result = cls._extract_basic_info(
            _db, work_tag, existing_authors, **restrictions)
        if not result:
            # This record did not meet one of the restrictions.
            return None, False

        title, authors_and_roles, language = result

        # Record some extra OCLC-specific information.
        editions = work_tag.get('editions')
        holdings = work_tag.get('holdings')

        # Get an identifier for this work.
        identifier, ignore = Identifier.for_foreign_id(
            _db, Identifier.OCLC_WORK, oclc_work_id)

        data_source = DataSource.lookup(_db, DataSource.OCLC)
        identifier.add_measurement(
            data_source, Measurement.HOLDINGS, holdings)
        identifier.add_measurement(
            data_source, Measurement.PUBLISHED_EDITIONS, editions)

        # Create an Edition for source + identifier.
        edition, new = get_one_or_create(
            _db, Edition,
            data_source=data_source,
            primary_identifier=identifier,
            create_method_kwargs=dict(
                title=title,
                language=language,
            )
        )

        # Get the most popular Dewey and LCC classification for this
        # work.
        for tag_name, subject_type in (
                ("ddc", Subject.DDC),
                ("lcc", Subject.LCC)):
            tag = cls._xpath1(
                work_tag, "//oclc:%s/oclc:mostPopular" % tag_name)
            if tag is not None:
                id = tag.get('nsfa') or tag.get('sfa')
                weight = int(tag.get('holdings'))
                identifier.classify(
                    data_source, subject_type, id, weight=weight)

        # Find FAST subjects for the work.
        for heading in cls._xpath(work_tag, "//oclc:fast//oclc:heading"):
            id = heading.get('ident')
            weight = int(heading.get('heldby'))
            value = heading.text
            identifier.classify(data_source, Subject.FAST, id, value, weight)

        # Associate the authors with the Edition.
        for contributor, roles in authors_and_roles:
            edition.add_contributor(contributor, roles)

        return edition, new

    @classmethod
    def extract_edition_record(cls, _db, edition_tag, existing_authors,
                               **restrictions):
        """Create a new Edition object with information about an
        edition of a book (identified by OCLC Number).
        """
        oclc_number = unicode(edition_tag.get('oclc'))
        try:
            int(oclc_number)
        except ValueError as e:
            # This record does not have a valid OCLC number.
            return None, False

        # Fill in some basic information about this new record.
        result = cls._extract_basic_info(
            _db, edition_tag, existing_authors, **restrictions)
        if not result:
            # This record did not meet one of the restrictions.
            return None, False

        title, authors_and_roles, language = result

        # Add a couple extra bits of OCLC-specific information.
        extra = {
            OCLC.HOLDING_COUNT: edition_tag.get('holdings'),
            OCLC.FORMAT: edition_tag.get('itemtype'),
        }

        # Get an identifier for this edition.
        identifier, ignore = Identifier.for_foreign_id(
            _db, Identifier.OCLC_NUMBER, oclc_number)

        # Create an Edition for source + identifier.
        data_source = DataSource.lookup(_db, DataSource.OCLC)
        subjects = {}
        edition_record, new = get_one_or_create(
            _db, Edition,
            data_source=data_source,
            primary_identifier=identifier,
            create_method_kwargs=dict(
                title=title,
                language=language,
                subjects=subjects,
                extra=extra,
            )
        )

        for subject_type, oclc_code in (
                (Subject.LCC, "050"),
                (Subject.DDC, "082")):
            classification = cls._xpath1(
                edition_tag,
                "oclc:classifications/oclc:class[@tag=%s]" % oclc_code)
            if classification is not None:
                value = classification.get("nsfa") or classification.get('sfa')
                identifier.classify(data_source, subject_type, value)

        # Associate each contributor with the new record.
        for author, roles in authors_and_roles:
            edition_record.add_contributor(author, roles)

        return edition_record, new
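The ROLES and LIFESPAN patterns above are easiest to verify against the example given in the comments. A standalone check using the same regexes:

import re

ROLES = re.compile("\[([^]]+)\]$")
LIFESPAN = re.compile("([0-9]+)-([0-9]*)[.;]?$")

author = "Giles, Lionel, 1875-1958 [Writer of added commentary; Translator]"
m = ROLES.search(author)
assert m.groups()[0] == "Writer of added commentary; Translator"
author = author[:m.start()].strip()          # "Giles, Lionel, 1875-1958"

m = LIFESPAN.search(author)
assert m.groups() == ("1875", "1958")
author = author[:m.start()].strip().rstrip(",")
assert author == "Giles, Lionel"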
class AnnotationParser(object):

    @classmethod
    def parse(cls, _db, data, patron):
        if patron.synchronize_annotations != True:
            return PATRON_NOT_OPTED_IN_TO_ANNOTATION_SYNC

        try:
            data = json.loads(data)
            data = jsonld.expand(data)
        except ValueError as e:
            return INVALID_ANNOTATION_FORMAT

        if not data or not len(data) == 1:
            return INVALID_ANNOTATION_TARGET
        data = data[0]

        target = data.get("http://www.w3.org/ns/oa#hasTarget")
        if not target or not len(target) == 1:
            return INVALID_ANNOTATION_TARGET
        target = target[0]
        source = target.get("http://www.w3.org/ns/oa#hasSource")
        if not source or not len(source) == 1:
            return INVALID_ANNOTATION_TARGET
        source = source[0].get('@id')
        identifier, ignore = Identifier.parse_urn(_db, source)

        motivation = data.get("http://www.w3.org/ns/oa#motivatedBy")
        if not motivation or not len(motivation) == 1:
            return INVALID_ANNOTATION_MOTIVATION
        motivation = motivation[0].get('@id')
        if motivation not in Annotation.MOTIVATIONS:
            return INVALID_ANNOTATION_MOTIVATION

        loans = patron.loans
        loan_identifiers = [loan.license_pool.identifier for loan in loans]
        if identifier not in loan_identifiers:
            return INVALID_ANNOTATION_TARGET

        content = data.get("http://www.w3.org/ns/oa#hasBody")
        if content and len(content) == 1:
            content = content[0]
        else:
            content = None

        target = json.dumps(target)
        extra_kwargs = {}
        if motivation == Annotation.IDLING:
            # A given book can only have one 'idling' annotation.
            pass
        elif motivation == Annotation.BOOKMARKING:
            # A given book can only have one 'bookmarking' annotation
            # per target.
            extra_kwargs['target'] = target

        annotation, ignore = Annotation.get_one_or_create(
            _db, patron=patron, identifier=identifier,
            motivation=motivation, **extra_kwargs
        )

        annotation.target = target
        if content:
            annotation.content = json.dumps(content)
        annotation.active = True
        annotation.timestamp = datetime.now()

        return annotation
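AnnotationParser walks an already-expanded JSON-LD document, in which every property value is a list of nodes. A self-contained sketch of the same extraction against a hand-expanded Web Annotation (no network access or pyld required):

expanded = [{
    "http://www.w3.org/ns/oa#hasTarget": [{
        "http://www.w3.org/ns/oa#hasSource": [
            {"@id": "urn:isbn:9780804171335"}
        ]
    }],
    "http://www.w3.org/ns/oa#motivatedBy": [
        {"@id": "http://www.w3.org/ns/oa#bookmarking"}
    ],
}]

data = expanded[0]
target = data["http://www.w3.org/ns/oa#hasTarget"][0]
source = target["http://www.w3.org/ns/oa#hasSource"][0]["@id"]
motivation = data["http://www.w3.org/ns/oa#motivatedBy"][0]["@id"]
assert source == "urn:isbn:9780804171335"
assert motivation.endswith("bookmarking")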
def process_item(self, identifier):
    # Books are not looked up in OCLC Linked Data directly, since
    # there is no Collection that identifies a book by its OCLC Number.
    # However, when a book is looked up through OCLC Classify, some
    # OCLC Numbers may be associated with it, and _those_ numbers
    # can be run through OCLC Linked Data.
    try:
        new_info_counter = Counter()
        self.log.info("Processing identifier %r", identifier)
        metadatas = [m for m in self.api.info_for(identifier)]

        if identifier.type == Identifier.ISBN:
            # info_for() currently returns results for OCLC Work IDs
            # only. This segment also gets the metadata of any
            # equivalent OCLC Numbers.
            equivalents = Identifier.recursively_equivalent_identifier_ids(
                self._db, [identifier.id]
            )
            oclc_numbers = self._db.query(Identifier).\
                filter(Identifier.id.in_(equivalents)).\
                filter(Identifier.type == Identifier.OCLC_NUMBER).all()
            for oclc_number in oclc_numbers:
                more_metadata = [m for m in self.api.info_for(oclc_number)]
                metadatas += more_metadata

        metadatas = [m for m in metadatas if m]

        for metadata in metadatas:
            other_identifier, ignore = metadata.primary_identifier.load(self._db)
            oclc_editions = other_identifier.primarily_identifies

            # Keep track of the number of editions OCLC associates
            # with this identifier.
            other_identifier.add_measurement(
                self.data_source, Measurement.PUBLISHED_EDITIONS,
                len(oclc_editions)
            )

            # Clean up contributor information.
            self.apply_viaf_to_contributor_data(metadata)
            # Remove any empty ContributorData objects that may have
            # been created.
            metadata.contributors = filter(
                lambda c: c.sort_name or c.display_name,
                metadata.contributors
            )

            # When metadata is applied, it must be given a client that
            # can respond to 'canonicalize_author_name'. Usually this is
            # an OPDSImporter that reaches out to the Metadata Wrangler,
            # but in the case of being _on_ the Metadata Wrangler...:
            from canonicalize import AuthorNameCanonicalizer
            metadata_client = AuthorNameCanonicalizer(
                self._db, oclcld=self.api, viaf=self.viaf
            )

            num_new_isbns = self.new_isbns(metadata)
            new_info_counter['isbns'] += num_new_isbns
            if oclc_editions:
                # There are existing OCLC editions. Apply any new
                # information to them.
                for edition in oclc_editions:
                    metadata, new_info_counter = self.apply_metadata_to_edition(
                        edition, metadata, metadata_client, new_info_counter
                    )
            else:
                # Create a new OCLC edition to hold the information.
                edition, ignore = get_one_or_create(
                    self._db, Edition, data_source=self.data_source,
                    primary_identifier=other_identifier
                )
                metadata, new_info_counter = self.apply_metadata_to_edition(
                    edition, metadata, metadata_client, new_info_counter
                )
                # Set the new OCLC edition's identifier equivalent to
                # this identifier so we know they're related.
                self.set_equivalence(identifier, metadata)

        self.log.info(
            "Total: %(editions)d editions, %(isbns)d ISBNs, "
            "%(descriptions)d descriptions, %(subjects)d classifications.",
            new_info_counter
        )
    except IOError as e:
        if ", but couldn't find location" in e.message:
            exception = "OCLC doesn't know about this ISBN: %r" % e
            transient = False
        else:
            exception = "OCLC raised an error: %r" % e
            transient = True
        return self.failure(identifier, exception, transient=transient)

    # Try to calculate or recalculate a work for ISBNs.
    #
    # We won't do this for other Identifier types because we don't want
    # to overwrite the high-quality metadata direct from the source.
    # With ISBNs, that higher-quality metadata is not available, so we
    # depend on OCLC for title and author information.
if identifier.type == Identifier.ISBN: self.calculate_work_for_isbn(identifier) return identifier
def process_urns(self, urns, collection_details=None, **kwargs):
    """Process URNs submitted via a lookup request.

    An authenticated request can process up to 30 URNs at once, but
    must specify a collection under which to catalog the URNs. This is
    used when initially recording the fact that certain URNs are in a
    collection, to get a baseline set of metadata. Updates on the books
    should be obtained through the CatalogController.

    An unauthenticated request is used for testing. Such a request does
    not have to specify a collection (the "Unaffiliated" collection is
    used), but it can only process one URN at a time.

    :return: None or ProblemDetail
    """
    client = authenticated_client_from_request(self._db, required=False)
    if isinstance(client, ProblemDetail):
        return client

    resolve_now = request.args.get('resolve_now', None) is not None
    collection = collection_from_details(
        self._db, client, collection_details)

    if client:
        # Authenticated access.
        if not collection:
            return INVALID_INPUT.detailed(_("No collection provided."))
        limit = 30
    else:
        # Anonymous access.
        collection = self.default_collection
        limit = 1

    if resolve_now:
        # You can't force-resolve more than one Identifier at a time.
        limit = 1

    if len(urns) > limit:
        return INVALID_INPUT.detailed(
            _("The maximum number of URNs you can provide at once is %d. (You sent %d)"
            ) % (limit, len(urns)))

    identifiers_by_urn, failures = Identifier.parse_urns(
        self._db, urns, allowed_types=self.VALID_TYPES)
    self.add_urn_failure_messages(failures)

    # Catalog all identifiers.
    collection.catalog_identifiers(identifiers_by_urn.values())

    # Load all coverage records in a single query to speed up the
    # code that reports on the status of Identifiers that aren't
    # ready.
    self.bulk_load_coverage_records(identifiers_by_urn.values())

    resolver = IdentifierResolutionCoverageProvider(
        collection, provide_coverage_immediately=resolve_now,
        **self.coverage_provider_kwargs)

    for urn, identifier in identifiers_by_urn.items():
        self.process_identifier(identifier, urn, resolver=resolver)
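# A minimal sketch (not from the source) of the Identifier.parse_urns call
# used above, showing the shape of its two return values. The URNs are
# hypothetical.
identifiers_by_urn, failures = Identifier.parse_urns(
    _db, ["urn:isbn:9780123456789", "not-a-urn"]
)
# identifiers_by_urn -> {"urn:isbn:9780123456789": <Identifier>}
# failures           -> ["not-a-urn"]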
def extract_edition(cls, _db, work_tag, existing_authors, **restrictions):
    """Create a new Edition object with information about a
    work (identified by OCLC Work ID).
    """
    # TODO: 'pswid' is what it's called in older representations.
    # That code can be removed once we replace all representations.
    oclc_work_id = unicode(work_tag.get('owi') or work_tag.get('pswid'))

    if not oclc_work_id:
        raise ValueError("Work has no owi")

    item_type = work_tag.get("itemtype")
    if not item_type:
        # Without an item type we can't tell whether this is a book.
        return None, False

    if (item_type.startswith('itemtype-book')
            or item_type.startswith('itemtype-compfile')):
        medium = Edition.BOOK_MEDIUM
    elif (item_type.startswith('itemtype-audiobook')
          or item_type.startswith('itemtype-music')):
        # Pretty much all Gutenberg texts, even the audio texts,
        # are based on a book, and the ones that aren't
        # (recordings of individual songs) probably aren't in OCLC
        # anyway. So we just want to get the books.
        #medium = Edition.AUDIO_MEDIUM
        medium = None
    elif item_type.startswith('itemtype-video'):
        #medium = Edition.VIDEO_MEDIUM
        medium = None
    elif item_type in cls.UNUSED_MEDIA:
        medium = None
    else:
        medium = None

    # Only create Editions for books with a recognized medium.
    if medium is None:
        return None, False

    result = cls._extract_basic_info(
        _db, work_tag, existing_authors, **restrictions)
    if not result:
        # This record did not meet one of the restrictions.
        return None, False

    title, authors_and_roles, language = result

    # Record some extra OCLC-specific information.
    editions = work_tag.get('editions')
    holdings = work_tag.get('holdings')

    # Get an identifier for this work.
    identifier, ignore = Identifier.for_foreign_id(
        _db, Identifier.OCLC_WORK, oclc_work_id
    )
    data_source = DataSource.lookup(_db, DataSource.OCLC)
    identifier.add_measurement(data_source, Measurement.HOLDINGS, holdings)
    identifier.add_measurement(
        data_source, Measurement.PUBLISHED_EDITIONS, editions)

    # Create an Edition for source + identifier.
    edition, new = get_one_or_create(
        _db, Edition,
        data_source=data_source,
        primary_identifier=identifier,
        create_method_kwargs=dict(
            title=title,
            language=language,
        )
    )

    # Get the most popular Dewey and LCC classifications for this
    # work.
    for tag_name, subject_type in (
            ("ddc", Subject.DDC), ("lcc", Subject.LCC)):
        tag = cls._xpath1(
            work_tag, "//oclc:%s/oclc:mostPopular" % tag_name)
        if tag is not None:
            id = tag.get('nsfa') or tag.get('sfa')
            weight = int(tag.get('holdings'))
            identifier.classify(
                data_source, subject_type, id, weight=weight)

    # Find FAST subjects for the work.
    for heading in cls._xpath(
            work_tag, "//oclc:fast//oclc:heading"):
        id = heading.get('ident')
        weight = int(heading.get('heldby'))
        value = heading.text
        identifier.classify(
            data_source, Subject.FAST, id, value, weight)

    # Associate the authors with the Edition.
    for contributor, roles in authors_and_roles:
        edition.add_contributor(contributor, roles)
    return edition, new
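# A minimal sketch (not from the source) of the work XML extract_edition
# expects, reconstructed from the attributes read above. The namespace URI
# and all values are hypothetical; the real OCLC Classify response may
# differ.
SAMPLE_WORK_XML = """
<work xmlns:oclc="http://classify.oclc.org" owi="47186005"
      itemtype="itemtype-book" editions="120" holdings="3000">
  <oclc:ddc><oclc:mostPopular nsfa="813.52" holdings="2500"/></oclc:ddc>
  <oclc:lcc><oclc:mostPopular nsfa="PS3515" holdings="2400"/></oclc:lcc>
  <oclc:fast>
    <oclc:heading ident="fst01168328" heldby="1800">Whales</oclc:heading>
  </oclc:fast>
</work>
"""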
def test_recursively_equivalent_identifiers(self):
    # We start with a Gutenberg book.
    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
    record, ignore = Edition.for_foreign_id(
        self._db, gutenberg, Identifier.GUTENBERG_ID, "100")
    gutenberg_id = record.primary_identifier

    # We use OCLC Classify to do a title/author lookup.
    oclc = DataSource.lookup(self._db, DataSource.OCLC)
    search_id, ignore = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_WORK, "60010")
    gutenberg_id.equivalent_to(oclc, search_id, 1)

    # The title/author lookup associates the search term with two
    # different OCLC Numbers.
    oclc_id, ignore = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, "9999")
    oclc_id_2, ignore = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, "1000")
    search_id.equivalent_to(oclc, oclc_id, 1)
    search_id.equivalent_to(oclc, oclc_id_2, 1)

    # We then use OCLC Linked Data to connect one of the OCLC
    # Numbers with an ISBN.
    linked_data = DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)
    isbn_id, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, "900100434X")
    oclc_id.equivalent_to(linked_data, isbn_id, 1)

    # As it turns out, we have an Overdrive work record...
    overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)
    overdrive_record, ignore = Edition.for_foreign_id(
        self._db, overdrive, Identifier.OVERDRIVE_ID, "{111-222}")
    overdrive_id = overdrive_record.primary_identifier

    # ...which is tied (by Overdrive) to the same ISBN.
    overdrive_id.equivalent_to(overdrive, isbn_id, 1)

    # Finally, here's a completely unrelated Edition, which should
    # not show up at any level.
    gutenberg2, ignore = Edition.for_foreign_id(
        self._db, gutenberg, Identifier.GUTENBERG_ID, "200")
    gutenberg2.title = "Unrelated Gutenberg record."

    levels = [
        record.equivalent_identifiers(
            policy=PresentationCalculationPolicy(
                equivalent_identifier_levels=i))
        for i in range(0, 5)
    ]

    # At level 0, the only identifier found is the Gutenberg ID.
    assert set([gutenberg_id]) == set(levels[0])

    # At level 1, we pick up the title/author lookup.
    assert set([gutenberg_id, search_id]) == set(levels[1])

    # At level 2, we pick up the title/author lookup and the two
    # OCLC Numbers.
    assert set(
        [gutenberg_id, search_id, oclc_id, oclc_id_2]) == set(levels[2])

    # At level 3, we also pick up the ISBN.
    assert set(
        [gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id]
    ) == set(levels[3])

    # At level 4, the recursion starts to go in the other
    # direction: we pick up the Overdrive ID that's equivalent to
    # the same ISBN as the OCLC Number.
    assert set([
        gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id, overdrive_id
    ]) == set(levels[4])
def do_run(self, _db): identifier = Identifier(type="You Can", identifier="Keep It") _db.add(identifier) raise RuntimeError
            message=message, action=action)

    # By now we can assume the response is either empty or a list.
    for item in resp_obj:
        # Go through the patron's holds, building HoldInfo objects.
        media_type = item.get('mediaType', 'eBook')
        isbn = item.get('isbn', None)
        title = item.get('title', None)
        authors = item.get('authors', None)
        expires = item.get('expiration', None)
        if expires:
            expires = datetime.datetime.strptime(
                expires, self.EXPIRATION_DATE_FORMAT).date()

        identifier = Identifier.from_asin(self._db, isbn, autocreate=False)
        # Note: if OneClick knows about a patron's hold that wasn't
        # placed through us, we ignore it.
        if not identifier:
            continue

        hold = HoldInfo(self.collection, DataSource.RB_DIGITAL,
                        Identifier.RB_DIGITAL_ID, isbn,
                        start_date=None, end_date=expires,
                        hold_position=None)
        holds.append(hold)
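# A minimal sketch (not from the source): EXPIRATION_DATE_FORMAT is defined
# elsewhere in the class. Assuming an ISO-style "%Y-%m-%d" format, the
# conversion above behaves like this; the date value is hypothetical.
expires = datetime.datetime.strptime("2016-10-15", "%Y-%m-%d").date()
# -> datetime.date(2016, 10, 15)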
def lookup_info_to_metadata(self, lookup_representation):
    """Transforms a NoveList JSON representation into a Metadata object"""

    if not lookup_representation.content:
        return None

    lookup_info = json.loads(lookup_representation.content)
    book_info = lookup_info.get('TitleInfo')
    novelist_identifier = None
    if book_info:
        novelist_identifier = book_info.get('ui')
    if not novelist_identifier:
        # NoveList didn't know the ISBN.
        return None

    primary_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, novelist_identifier
    )
    metadata = Metadata(self.source, primary_identifier=primary_identifier)

    # Get the equivalent ISBN identifiers.
    metadata.identifiers += self._extract_isbns(book_info)

    author = book_info.get('author')
    if author:
        metadata.contributors.append(ContributorData(sort_name=author))

    description = book_info.get('description')
    if description:
        metadata.links.append(LinkData(
            rel=Hyperlink.DESCRIPTION, content=description,
            media_type=Representation.TEXT_PLAIN
        ))

    audience_level = book_info.get('audience_level')
    if audience_level:
        metadata.subjects.append(SubjectData(
            Subject.FREEFORM_AUDIENCE, audience_level
        ))

    novelist_rating = book_info.get('rating')
    if novelist_rating:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, novelist_rating
        ))

    # Extract feature content if it is available.
    series_info = None
    appeals_info = None
    lexile_info = None
    goodreads_info = None
    recommendations_info = None
    feature_content = lookup_info.get('FeatureContent')
    if feature_content:
        series_info = feature_content.get('SeriesInfo')
        appeals_info = feature_content.get('Appeals')
        lexile_info = feature_content.get('LexileInfo')
        goodreads_info = feature_content.get('GoodReads')
        recommendations_info = feature_content.get('SimilarTitles')

    metadata, title_key = self.get_series_information(
        metadata, series_info, book_info
    )
    metadata.title = book_info.get(title_key)
    subtitle = TitleProcessor.extract_subtitle(
        metadata.title, book_info.get('full_title')
    )
    metadata.subtitle = self._scrub_subtitle(subtitle)

    if appeals_info:
        extracted_genres = False
        for appeal in appeals_info:
            genres = appeal.get('genres')
            if genres:
                for genre in genres:
                    metadata.subjects.append(SubjectData(
                        Subject.TAG, genre['Name']
                    ))
                    extracted_genres = True
            if extracted_genres:
                break

    if lexile_info:
        metadata.subjects.append(SubjectData(
            Subject.LEXILE_SCORE, lexile_info['Lexile']
        ))

    if goodreads_info:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, goodreads_info['average_rating']
        ))

    metadata = self.get_recommendations(metadata, recommendations_info)

    # If nothing interesting comes from the API, ignore it.
    if not (metadata.measurements or metadata.series_position
            or metadata.series or metadata.subjects or metadata.links
            or metadata.subtitle or metadata.recommendations):
        metadata = None
    return metadata
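# A minimal sketch (not from the source) of the NoveList lookup JSON that
# lookup_info_to_metadata consumes, limited to the keys read above. All
# values are hypothetical placeholders.
SAMPLE_NOVELIST_LOOKUP = {
    "TitleInfo": {
        "ui": "9780123456789",
        "title": "Example Title",
        "full_title": "Example Title: A Subtitle",
        "author": "Melville, Herman",
        "description": "An example description.",
        "audience_level": "Adult",
        "rating": 3.5,
    },
    "FeatureContent": {
        "SeriesInfo": {},
        "Appeals": [{"genres": [{"Name": "Literary fiction"}]}],
        "LexileInfo": {"Lexile": "1030"},
        "GoodReads": {"average_rating": 3.5},
        "SimilarTitles": {},
    },
}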
import os
import sys

from overdrive import OverdriveAPI
from threem import ThreeMAPI
from circulation import CirculationAPI
from core.model import (
    Identifier,
    Patron,
    get_one_or_create,
    production_session,
)

barcode, pin, borrow_urn, hold_urn = sys.argv[1:5]
email = os.environ.get(
    "DEFAULT_NOTIFICATION_EMAIL_ADDRESS", "*****@*****.**")

_db = production_session()
patron, ignore = get_one_or_create(
    _db, Patron, authorization_identifier=barcode)

borrow_identifier = Identifier.parse_urn(_db, borrow_urn, True)[0]
hold_identifier = Identifier.parse_urn(_db, hold_urn, True)[0]
borrow_pool = borrow_identifier.licensed_through
hold_pool = hold_identifier.licensed_through

if any(x.type == Identifier.THREEM_ID
       for x in [borrow_identifier, hold_identifier]):
    threem = ThreeMAPI(_db)
else:
    threem = None

if any(x.type == Identifier.OVERDRIVE_ID
       for x in [borrow_identifier, hold_identifier]):
    overdrive = OverdriveAPI(_db)
else:
    overdrive = None
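# Usage sketch (not from the source), inferred from the sys.argv[1:5]
# unpacking above. The script name is hypothetical:
#
#   python borrow_and_hold.py <barcode> <pin> <borrow_urn> <hold_urn>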