Example #1
    def test_confirm_same_identifier(self):
        source = DataSource.lookup(self._db, DataSource.NOVELIST)
        identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, '84752928'
        )
        unmatched_identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, '23781947'
        )
        metadata = Metadata(source, primary_identifier=identifier)
        match = Metadata(source, primary_identifier=identifier)
        mistake = Metadata(source, primary_identifier=unmatched_identifier)

        eq_(False, self.novelist._confirm_same_identifier([metadata, mistake]))
        eq_(True, self.novelist._confirm_same_identifier([metadata, match]))

Example #2
    def oclc_number_for_isbn(self, isbn):
        """Turn an ISBN identifier into an OCLC Number identifier."""

        # Let's pretend any id can be an oclc id.
        oclc_number = isbn.identifier
        oclc_identifier, made_new = Identifier.for_foreign_id(
            self._db, Identifier.OCLC_NUMBER, oclc_number, autocreate=True)

        return oclc_identifier
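
All of the examples on this page revolve around the same call shape: Identifier.for_foreign_id returns a 2-tuple of the Identifier object and a flag that is True when the row was just created. A minimal usage sketch, assuming the models live in core.model (as in the Library Simplified codebase) and that _db is an open database session:

from core.model import Identifier  # assumed import path

def ensure_isbn(_db, isbn_string):
    # The second element of the tuple tells us whether the row
    # already existed or was created by this call.
    identifier, is_new = Identifier.for_foreign_id(
        _db, Identifier.ISBN, isbn_string
    )
    if is_new:
        # Callers that only want existing rows pass autocreate=False
        # and check for None instead (see Examples #16 and #17).
        print("Created %s/%s" % (identifier.type, identifier.identifier))
    return identifier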
Example #3
    def handle_event(self, threem_id, isbn, foreign_patron_id,
                     start_time, end_time, internal_event_type):
        # Find or lookup the LicensePool for this event.
        license_pool, is_new = LicensePool.for_foreign_id(
            self._db, self.api.source, Identifier.THREEM_ID, threem_id)

        if is_new:
            # Immediately acquire bibliographic coverage for this book.
            # This will set the DistributionMechanisms and make the
            # book presentation-ready. However, its circulation information
            # might not be up to date until we process some more events.
            record = self.bibliographic_coverage_provider.ensure_coverage(
                license_pool.identifier, force=True
            )

        threem_identifier = license_pool.identifier
        isbn, ignore = Identifier.for_foreign_id(
            self._db, Identifier.ISBN, isbn)

        edition, ignore = Edition.for_foreign_id(
            self._db, self.api.source, Identifier.THREEM_ID, threem_id)

        # The ISBN and the 3M identifier are exactly equivalent.
        threem_identifier.equivalent_to(self.api.source, isbn, strength=1)

        # Log the event.
        event, was_new = get_one_or_create(
            self._db, CirculationEvent, license_pool=license_pool,
            type=internal_event_type, start=start_time,
            foreign_patron_id=foreign_patron_id,
            create_method_kwargs=dict(delta=1, end=end_time)
            )

        # If this is our first time seeing this LicensePool, log its
        # occurrence as a separate event.
        if is_new:
            event, ignore = get_one_or_create(
                self._db, CirculationEvent,
                type=CirculationEvent.TITLE_ADD,
                license_pool=license_pool,
                create_method_kwargs=dict(
                    start=license_pool.last_checked or start_time,
                    delta=1,
                    end=license_pool.last_checked or end_time,
                )
            )
        title = edition.title or "[no title]"
        self.log.info("%r %s: %s", start_time, title, internal_event_type)
        return start_time
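
A note on the helper used here and in the OCLC examples below: get_one_or_create looks an object up by the keyword filters and applies create_method_kwargs only when it has to create a new row. A hedged sketch of that contract (not the real implementation), assuming a SQLAlchemy session:

def get_one_or_create(db, model, create_method_kwargs=None, **kwargs):
    # Sketch only: return (instance, was_new), as the calls above assume.
    instance = db.query(model).filter_by(**kwargs).one_or_none()
    if instance is not None:
        return instance, False
    # Creation-only values are merged in just for the new row.
    kwargs.update(create_method_kwargs or {})
    instance = model(**kwargs)
    db.add(instance)
    return instance, True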
Example #4
    def oclc_number_for_isbn(self, isbn):
        """Turn an ISBN identifier into an OCLC Number identifier."""
        url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
        representation, cached = Representation.get(
            self._db, url, Representation.http_get_no_redirect)
        if not representation.location:
            raise IOError(
                "Expected %s to redirect, but couldn't find location." % url
            )

        location = representation.location
        match = self.URI_WITH_OCLC_NUMBER.match(location)
        if not match:
            raise IOError(
                "OCLC redirected ISBN lookup, but I couldn't make sense "
                "of the destination, %s" % location)
        oclc_number = match.groups()[0]
        return Identifier.for_foreign_id(
            self._db, Identifier.OCLC_NUMBER, oclc_number)[0]
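
URI_WITH_OCLC_NUMBER is a class-level constant defined elsewhere; a hypothetical approximation of its shape, for orientation only:

import re

# Hypothetical pattern; the grouped digits are what oclc_number_for_isbn
# pulls out of the redirect's Location header.
URI_WITH_OCLC_NUMBER = re.compile(r"^https?://[^/]+/oclc/([0-9]+)")

m = URI_WITH_OCLC_NUMBER.match("http://www.worldcat.org/oclc/1862341597")
assert m.groups()[0] == "1862341597"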
Example #5
    def process_item(self, work):
        try:
            content_item = self.content_item_from_work(work)
            result = self.api.create_content_item(content_item)
        except Exception as e:
            return CoverageFailure(
                work, str(e), data_source=self.data_source,
                transient=True
            )

        content_item_id = result.get('contentItemId')
        bibblio_identifier, _is_new = Identifier.for_foreign_id(
            self._db, Identifier.BIBBLIO_CONTENT_ITEM_ID, content_item_id
        )

        identifier = work.presentation_edition.primary_identifier
        identifier.equivalent_to(self.data_source, bibblio_identifier, 1)

        return work
Example #6
    def test_process_urn_isbn(self):
        # Create a new ISBN identifier.
        # Ask online providers for metadata to turn into an OPDS feed
        # about this identifier. Make sure a coverage record was created
        # and a 201 status was obtained from the provider.
        # Ask the online provider again, and make sure we now get a 202
        # "working on it" status.
        # Ask again, this time getting a result, and make sure we
        # actually receive it.

        isbn, ignore = Identifier.for_foreign_id(
            self._db, Identifier.ISBN, self._isbn
        )

        # The first time we look up an ISBN, a CoverageRecord is created
        # to represent the work that needs to be done.
        self.controller.process_urn(isbn.urn)
        self.assert_one_message(
            isbn.urn, HTTP_CREATED, self.controller.IDENTIFIER_REGISTERED
        )
        [record] = isbn.coverage_records
        eq_(record.exception, self.controller.NO_WORK_DONE_EXCEPTION)
        eq_(record.status, CoverageRecord.TRANSIENT_FAILURE)

        # So long as the necessary coverage has not been provided,
        # future lookups will not return useful information.
        self.controller.precomposed_entries = []
        self.controller.process_urn(isbn.urn)
        self.assert_one_message(
            isbn.urn, HTTP_ACCEPTED, self.controller.WORKING_TO_RESOLVE_IDENTIFIER
        )

        # Let's provide the coverage.
        metadata_sources = DataSource.metadata_sources_for(
            self._db, isbn
        )
        for source in metadata_sources:
            CoverageRecord.add_for(isbn, source)

        # Process the ISBN again, and we get an <entry> tag with the
        # information.
        self.controller.precomposed_entries = []
        self.controller.process_urn(isbn.urn)
        expect = isbn.opds_entry()
        [actual] = self.controller.precomposed_entries
        eq_(etree.tostring(expect), etree.tostring(actual))
Example #7
    def lookup(self, identifier_or_uri, processed_uris=set()):
        """Perform an OCLC Open Data lookup for the given identifier."""
        type = None
        identifier = None
        if isinstance(identifier_or_uri, basestring):
            # e.g. http://experiment.worldcat.org/oclc/1862341597.json
            match = self.URI_WITH_OCLC_NUMBER.search(identifier_or_uri)
            if match:
                type = Identifier.OCLC_NUMBER
                id = match.groups()[0]
                if not type or not id:
                    return None, None
                identifier, is_new = Identifier.for_foreign_id(
                    self._db, type, id)
        else:
            identifier = identifier_or_uri
            type = identifier.type
        if not type or not identifier:
            return None, None
        return self.lookup_by_identifier(identifier, processed_uris)
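
One caution: processed_uris=set() is a mutable default argument, so every call that relies on the default shares one set, and URIs seen in one lookup stay "seen" in all later ones. If that caching across calls is not intended, the usual idiom is a None default; a minimal sketch:

def lookup(self, identifier_or_uri, processed_uris=None):
    # Build a fresh set per call instead of sharing one across calls.
    if processed_uris is None:
        processed_uris = set()
    ...  # proceed as in the example above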
Example #8
    def test_lookup_info_to_metadata(self):
        # Basic book information is returned
        identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.ISBN, "9780804171335"
        )
        bad_character = self.sample_representation("a_bad_character.json")
        metadata = self.novelist.lookup_info_to_metadata(bad_character)

        eq_(True, isinstance(metadata, Metadata))
        eq_(Identifier.NOVELIST_ID, metadata.primary_identifier.type)
        eq_('10392078', metadata.primary_identifier.identifier)
        eq_("A bad character", metadata.title)
        eq_(None, metadata.subtitle)
        eq_(1, len(metadata.contributors))
        [contributor] = metadata.contributors
        eq_("Kapoor, Deepti", contributor.sort_name)
        eq_(4, len(metadata.identifiers))
        eq_(4, len(metadata.subjects))
        eq_(2, len(metadata.measurements))
        ratings = sorted(metadata.measurements, key=lambda m: m.value)
        eq_(2, ratings[0].value)
        eq_(3.27, ratings[1].value)
        eq_(625, len(metadata.recommendations))

        # Confirm that Lexile and series data is extracted with a
        # different sample.
        vampire = self.sample_representation("vampire_kisses.json")
        metadata = self.novelist.lookup_info_to_metadata(vampire)

        [lexile] = filter(lambda s: s.type=='Lexile', metadata.subjects)
        eq_(u'630', lexile.identifier)
        eq_(u'Vampire kisses manga', metadata.series)
        # The full title should be selected, since every volume
        # has the same main title: 'Vampire kisses'
        eq_(u'Vampire kisses: blood relatives. Volume 1', metadata.title)
        eq_(1, metadata.series_position)
        eq_(5, len(metadata.recommendations))
Example #9
    def oclc_works_for_isbn(self, isbn, processed_uris=set()):
        """Yield every OCLC Work graph for the given ISBN."""
        # Find the OCLC Number for this ISBN.
        oclc_number = self.oclc_number_for_isbn(isbn)

        # Retrieve the OCLC Linked Data document for that OCLC Number.
        oclc_number_data, was_new = self.lookup_by_identifier(
            oclc_number, processed_uris)
        if not oclc_number_data:
            return

        # Look up every work referenced in that document and yield its data.
        graph = OCLCLinkedData.graph(oclc_number_data)
        works = OCLCLinkedData.extract_works(graph)
        for work_uri in works:
            m = self.URI_WITH_OCLC_WORK_ID.match(work_uri)
            if m:
                work_id = m.groups()[0]
                identifier, was_new = Identifier.for_foreign_id(
                    self._db, Identifier.OCLC_WORK, work_id)

                oclc_work_data, cached = self.lookup_by_identifier(
                    identifier, processed_uris)
                yield oclc_work_data
Example #10
    def lookup_info_to_metadata(self, lookup_representation):
        """Transforms a NoveList JSON representation into a Metadata object"""

        if not lookup_representation.content:
            return None

        lookup_info = json.loads(lookup_representation.content)
        book_info = lookup_info["TitleInfo"]
        if book_info:
            novelist_identifier = book_info.get("ui")
        if not book_info or not novelist_identifier:
            # NoveList didn't know the ISBN.
            return None

        primary_identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, novelist_identifier
        )
        metadata = Metadata(self.source, primary_identifier=primary_identifier)

        # Get the equivalent ISBN identifiers.
        metadata.identifiers += self._extract_isbns(book_info)

        author = book_info.get("author")
        if author:
            metadata.contributors.append(ContributorData(sort_name=author))

        description = book_info.get("description")
        if description:
            metadata.links.append(
                LinkData(
                    rel=Hyperlink.DESCRIPTION,
                    content=description,
                    media_type=Representation.TEXT_PLAIN,
                )
            )

        audience_level = book_info.get("audience_level")
        if audience_level:
            metadata.subjects.append(
                SubjectData(Subject.FREEFORM_AUDIENCE, audience_level)
            )

        novelist_rating = book_info.get("rating")
        if novelist_rating:
            metadata.measurements.append(
                MeasurementData(Measurement.RATING, novelist_rating)
            )

        # Extract feature content if it is available.
        series_info = None
        appeals_info = None
        lexile_info = None
        goodreads_info = None
        recommendations_info = None
        feature_content = lookup_info.get("FeatureContent")
        if feature_content:
            series_info = feature_content.get("SeriesInfo")
            appeals_info = feature_content.get("Appeals")
            lexile_info = feature_content.get("LexileInfo")
            goodreads_info = feature_content.get("GoodReads")
            recommendations_info = feature_content.get("SimilarTitles")

        metadata, title_key = self.get_series_information(
            metadata, series_info, book_info
        )
        metadata.title = book_info.get(title_key)
        subtitle = TitleProcessor.extract_subtitle(
            metadata.title, book_info.get("full_title")
        )
        metadata.subtitle = self._scrub_subtitle(subtitle)

        # TODO: How well do we trust this data? We could conceivably bump up
        # the weight here.
        if appeals_info:
            extracted_genres = False
            for appeal in appeals_info:
                genres = appeal.get("genres")
                if genres:
                    for genre in genres:
                        metadata.subjects.append(
                            SubjectData(Subject.TAG, genre["Name"])
                        )
                        extracted_genres = True
                if extracted_genres:
                    break

        if lexile_info:
            metadata.subjects.append(
                SubjectData(Subject.LEXILE_SCORE, lexile_info["Lexile"])
            )

        if goodreads_info:
            metadata.measurements.append(
                MeasurementData(Measurement.RATING, goodreads_info["average_rating"])
            )

        metadata = self.get_recommendations(metadata, recommendations_info)

        # If nothing interesting comes from the API, ignore it.
        if not (
            metadata.measurements
            or metadata.series_position
            or metadata.series
            or metadata.subjects
            or metadata.links
            or metadata.subtitle
            or metadata.recommendations
        ):
            metadata = None
        return metadata
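
For orientation, here is the nested JSON shape this method walks. The field names are taken from the lookups above; the values are illustrative only:

# Illustrative payload, reconstructed from the field lookups above; a
# real NoveList response carries many more fields.
lookup_info = {
    "TitleInfo": {
        "ui": "10392078",
        "title": "A bad character",
        "author": "Kapoor, Deepti",
        "description": "A novel.",
        "audience_level": "Adult",
        "rating": 3.27,
    },
    "FeatureContent": {
        "SeriesInfo": {},
        "Appeals": [{"genres": [{"Name": "Literary fiction"}]}],
        "LexileInfo": {"Lexile": "630"},
        "GoodReads": {"average_rating": 3.27},
        "SimilarTitles": {},
    },
}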
Example #11
class OCLCXMLParser(XMLParser):

    # OCLC in-representation 'status codes'
    SINGLE_WORK_SUMMARY_STATUS = 0
    SINGLE_WORK_DETAIL_STATUS = 2
    MULTI_WORK_STATUS = 4
    NO_INPUT_STATUS = 100
    INVALID_INPUT_STATUS = 101
    NOT_FOUND_STATUS = 102
    UNEXPECTED_ERROR_STATUS = 200

    INTS = set([OCLC.HOLDING_COUNT, OCLC.EDITION_COUNT])

    NAMESPACES = {'oclc': 'http://classify.oclc.org'}

    LIST_TYPE = "works"
    log = logging.getLogger("OCLC XML Parser")

    @classmethod
    def parse(cls, _db, xml, **restrictions):
        """Turn XML data from the OCLC lookup service into a list of SWIDs
        (for a multi-work response) or a list of Edition
        objects (for a single-work response).
        """
        tree = etree.fromstring(xml, parser=etree.XMLParser(recover=True))
        response = cls._xpath1(tree, "oclc:response")
        representation_type = int(response.get('code'))

        workset_record = None
        editions = []
        edition_records = []

        if representation_type == cls.UNEXPECTED_ERROR_STATUS:
            raise IOError("Unexpected error from OCLC API: %s" % xml)
        elif representation_type in (cls.NO_INPUT_STATUS,
                                     cls.INVALID_INPUT_STATUS):
            return representation_type, []
        elif representation_type == cls.SINGLE_WORK_SUMMARY_STATUS:
            raise IOError(
                "Got single-work summary from OCLC despite requesting detail: %s"
                % xml)

        # The real action happens here.
        if representation_type == cls.SINGLE_WORK_DETAIL_STATUS:
            authors_tag = cls._xpath1(tree, "//oclc:authors")

            work_tag = cls._xpath1(tree, "//oclc:work")
            primary_author = None
            if work_tag is not None:
                author_string = work_tag.get('author')
                primary_author = cls.primary_author_from_author_string(
                    _db, author_string)

            existing_authors = cls.extract_authors(
                _db, authors_tag, primary_author=primary_author)

            # The representation lists a single work, its authors, its editions,
            # plus summary classification information for the work.
            edition, ignore = cls.extract_edition(_db, work_tag,
                                                  existing_authors,
                                                  **restrictions)
            records = []
            if edition:
                cls.log.info("EXTRACTED %r", edition)
                records.append(edition)
            else:
                # The work record itself failed one of the
                # restrictions. None of its editions are likely to
                # succeed either.
                return representation_type, records

        elif representation_type == cls.MULTI_WORK_STATUS:
            # The representation lists a set of works that match the
            # search query.
            cls.log.debug("Extracting SWIDs from search results.")
            records = cls.extract_swids(_db, tree, **restrictions)
        elif representation_type == cls.NOT_FOUND_STATUS:
            # No problem; OCLC just doesn't have any data.
            records = []
        else:
            raise IOError("Unrecognized status code from OCLC API: %s (%s)" %
                          (representation_type, xml))

        return representation_type, records

    @classmethod
    def extract_swids(cls, _db, tree, **restrictions):
        """Turn a multi-work response into a list of SWIDs."""

        swids = []
        for work_tag in cls._xpath(tree, "//oclc:work"):
            # We're not calling extract_basic_info because we care about
            # the info, we're calling it to make sure this work meets
            # the restriction. If this work meets the restriction,
            # we'll store its info when we look up the SWID.
            response = cls._extract_basic_info(_db, work_tag, **restrictions)
            if response:
                title, author_names, language = response
                # TODO: 'swid' is what it's called in older representations.
                # That code can be removed once we replace all representations.
                work_identifier = work_tag.get('wi') or work_tag.get('swid')
                cls.log.debug("WORK ID %s (%s, %r, %s)", work_identifier,
                              title, author_names, language)
                swids.append(work_identifier)
        return swids

    ROLES = re.compile(r"\[([^]]+)\]$")
    LIFESPAN = re.compile(r"([0-9]+)-([0-9]*)[.;]?$")

    @classmethod
    def extract_authors(cls, _db, authors_tag, primary_author=None):
        results = []
        if authors_tag is not None:
            for author_tag in cls._xpath(authors_tag, "//oclc:author"):
                lc = author_tag.get('lc', None)
                viaf = author_tag.get('viaf', None)
                contributor, roles, default_role_used = cls._parse_single_author(
                    _db,
                    author_tag.text,
                    lc=lc,
                    viaf=viaf,
                    primary_author=primary_author)
                if contributor:
                    results.append(contributor)

        return results

    @classmethod
    def _contributor_match(cls, contributor, name, lc, viaf):
        return (contributor.sort_name == name
                and (lc is None or contributor.lc == lc)
                and (viaf is None or contributor.viaf == viaf))

    @classmethod
    def _parse_single_author(cls,
                             _db,
                             author,
                             lc=None,
                             viaf=None,
                             existing_authors=[],
                             default_role=Contributor.AUTHOR_ROLE,
                             primary_author=None):
        default_role_used = False
        # First find roles if present
        # "Giles, Lionel, 1875-1958 [Writer of added commentary; Translator]"
        author = author.strip()
        m = cls.ROLES.search(author)
        if m:
            author = author[:m.start()].strip()
            role_string = m.groups()[0]
            roles = [x.strip() for x in role_string.split(";")]
        elif default_role:
            roles = [default_role]
            default_role_used = True
        else:
            roles = []

        # Author string now looks like
        # "Giles, Lionel, 1875-1958"
        m = cls.LIFESPAN.search(author)
        kwargs = dict()
        if m:
            author = author[:m.start()].strip()
            birth, death = m.groups()
            if birth:
                kwargs[Contributor.BIRTH_DATE] = birth
            if death:
                kwargs[Contributor.DEATH_DATE] = death

        # Author string now looks like
        # "Giles, Lionel,"
        if author.endswith(","):
            author = author[:-1]

        contributor = None
        if not author:
            # No name was given for the author.
            return None, roles, default_role_used

        if primary_author and author == primary_author.sort_name:
            if Contributor.AUTHOR_ROLE in roles:
                roles.remove(Contributor.AUTHOR_ROLE)
            if Contributor.UNKNOWN_ROLE in roles:
                roles.remove(Contributor.UNKNOWN_ROLE)
            roles.insert(0, Contributor.PRIMARY_AUTHOR_ROLE)

        if existing_authors:
            # Calling Contributor.lookup will result in a database
            # hit, and looking up a contributor based on name may
            # result in multiple results (see below). We'll have no
            # way of distinguishing between those results. If
            # possible, it's much more reliable to look through
            # existing_authors (the authors derived from an entry's
            # <authors> tag).
            for x in existing_authors:
                if cls._contributor_match(x, author, lc, viaf):
                    contributor = x
                    break
            if contributor:
                was_new = False

        if not contributor:
            contributor, was_new = Contributor.lookup(_db,
                                                      author,
                                                      viaf,
                                                      lc,
                                                      extra=kwargs)
        if isinstance(contributor, list):
            # We asked for an author based solely on the name, which makes
            # Contributor.lookup() return a list.
            if len(contributor) == 1:
                # Fortunately, either the database knows about only
                # one author with that name, or it didn't know about
                # any authors with that name and it just created one,
                # so we can unambiguously use it.
                contributor = contributor[0]
            else:
                # Uh-oh. The database knows about multiple authors
                # with that name.  We have no basis for deciding which
                # author we mean. But we would prefer to identify with
                # an author who has a known LC or VIAF number.
                #
                # This should happen very rarely because of our check
                # against existing_authors above. But it will happen
                # for authors that have a work in Project Gutenberg.
                with_id = [
                    x for x in contributor
                    if x.lc is not None or x.viaf is not None
                ]
                if with_id:
                    contributor = with_id[0]
                else:
                    contributor = contributor[0]
        return contributor, roles, default_role_used

    @classmethod
    def primary_author_from_author_string(cls, _db, author_string):
        # If the first author mentioned in the author string
        # does not have an explicit role set, treat them as the primary
        # author.
        if not author_string:
            return None
        authors = author_string.split("|")
        if not authors:
            return None
        author, roles, default_role_used = cls._parse_single_author(
            _db, authors[0], default_role=Contributor.PRIMARY_AUTHOR_ROLE)
        if roles == [Contributor.PRIMARY_AUTHOR_ROLE]:
            return author
        return None

    @classmethod
    def parse_author_string(cls,
                            _db,
                            author_string,
                            existing_authors=[],
                            primary_author=None):
        default_role = Contributor.PRIMARY_AUTHOR_ROLE
        authors = []
        if not author_string:
            return authors
        for author in author_string.split("|"):
            author, roles, default_role_used = cls._parse_single_author(
                _db,
                author,
                existing_authors=existing_authors,
                default_role=default_role,
                primary_author=primary_author)
            if roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    # That was the primary author.  If we see someone
                    # with no explicit role after this point, they're
                    # just a regular author.
                    default_role = Contributor.AUTHOR_ROLE
                elif not default_role_used:
                    # We're dealing with someone whose role was
                    # explicitly specified. If we see someone with no
                    # explicit role after this point, it's probably
                    # because their role is so minor as to not be
                    # worth mentioning, not because it's so major that
                    # we can assume they're an author.
                    default_role = Contributor.UNKNOWN_ROLE
            roles = roles or [default_role]
            if author:
                authors.append((author, roles))
        return authors

    @classmethod
    def _extract_basic_info(cls,
                            _db,
                            tag,
                            existing_authors=None,
                            **restrictions):
        """Extract information common to work tag and edition tag."""
        title = tag.get('title')
        author_string = tag.get('author')
        authors_and_roles = cls.parse_author_string(_db, author_string,
                                                    existing_authors)
        if 'language' in tag.keys():
            language = tag.get('language')
        else:
            language = None

        if title and 'title' in restrictions:
            must_resemble_title = restrictions['title']
            threshold = restrictions.get('title_similarity', 0.25)
            similarity = MetadataSimilarity.title_similarity(
                must_resemble_title, title)
            if similarity < threshold:
                # The title of the book under consideration is not
                # similar enough to the given title.
                cls.log.debug("FAILURE TO RESEMBLE: %s vs %s (%.2f)", title,
                              must_resemble_title, similarity)
                return None

            # The semicolon is frequently used to separate multiple
            # works in an anthology. If there is no semicolon in the
            # original title, do not consider titles that contain
            # semicolons.
            if (' ; ' not in must_resemble_title and ' ; ' in title
                    and threshold > 0):
                cls.log.debug("SEMICOLON DISQUALIFICATION: %s", title)
                return None

        # Apply restrictions. If they're not met, return None.
        if 'language' in restrictions and language:
            # We know which language this record is for. Match it
            # against the language used in the Edition we're
            # matching against.
            restrict_to_languages = set(restrictions['language'])
            if language not in restrict_to_languages:
                # This record is for a book in a different language.
                cls.log.debug("WRONG LANGUAGE: %s", language)
                return None

        if 'authors' in restrictions:
            restrict_to_authors = restrictions['authors']
            if restrict_to_authors and isinstance(restrict_to_authors[0],
                                                  Contributor):
                restrict_to_authors = [
                    x.sort_name for x in restrict_to_authors
                ]
            primary_author = None

            for a, roles in authors_and_roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    primary_author = a
                    break
            if (not primary_author or
                (primary_author not in restrict_to_authors
                 and primary_author.sort_name not in restrict_to_authors)):
                # None of the given authors showed up as the
                # primary author of this book. They may have had
                # some other role in it, or the book may be about
                # them, or incorporate their work, but this book
                # is not *by* them.
                return None

        return title, authors_and_roles, language

    UNUSED_MEDIA = set([
        "itemtype-intmm",
        "itemtype-msscr",
        "itemtype-artchap-artcl",
        "itemtype-jrnl",
        "itemtype-map",
        "itemtype-vis",
        "itemtype-jrnl-digital",
        "itemtype-image-2d",
        "itemtype-artchap-digital",
        "itemtype-intmm-digital",
        "itemtype-archv",
        "itemtype-msscr-digital",
        "itemtype-game",
        "itemtype-web-digital",
        "itemtype-map-digital",
    ])

    @classmethod
    def extract_edition(cls, _db, work_tag, existing_authors, **restrictions):
        """Create a new Edition object with information about a
        work (identified by OCLC Work ID).
        """
        # TODO: 'pswid' is what it's called in older representations.
        # That code can be removed once we replace all representations.
        oclc_work_id = work_tag.get('owi') or work_tag.get('pswid')
        if not oclc_work_id:
            raise ValueError("Work has no owi")
        oclc_work_id = unicode(oclc_work_id)

        item_type = work_tag.get("itemtype")
        if (item_type.startswith('itemtype-book')
                or item_type.startswith('itemtype-compfile')):
            medium = Edition.BOOK_MEDIUM
        elif (item_type.startswith('itemtype-audiobook')
              or item_type.startswith('itemtype-music')):
            # Pretty much all Gutenberg texts, even the audio texts,
            # are based on a book, and the ones that aren't
            # (recordings of individual songs) probably aren't in OCLC
            # anyway. So we just want to get the books.
            # medium = Edition.AUDIO_MEDIUM
            medium = None
        elif item_type.startswith('itemtype-video'):
            #medium = Edition.VIDEO_MEDIUM
            medium = None
        elif item_type in cls.UNUSED_MEDIA:
            medium = None
        else:
            medium = None

        # Only create Editions for books with a recognized medium
        if medium is None:
            return None, False

        result = cls._extract_basic_info(_db, work_tag, existing_authors,
                                         **restrictions)
        if not result:
            # This record did not meet one of the restrictions.
            return None, False

        title, authors_and_roles, language = result

        # Record some extra OCLC-specific information
        editions = work_tag.get('editions')
        holdings = work_tag.get('holdings')

        # Get an identifier for this work.
        identifier, ignore = Identifier.for_foreign_id(_db,
                                                       Identifier.OCLC_WORK,
                                                       oclc_work_id)

        data_source = DataSource.lookup(_db, DataSource.OCLC)
        identifier.add_measurement(data_source, Measurement.HOLDINGS, holdings)
        identifier.add_measurement(data_source, Measurement.PUBLISHED_EDITIONS,
                                   editions)

        # Create an Edition for source + identifier
        edition, new = get_one_or_create(_db,
                                         Edition,
                                         data_source=data_source,
                                         primary_identifier=identifier,
                                         create_method_kwargs=dict(
                                             title=title,
                                             language=language,
                                         ))

        # Get the most popular Dewey and LCC classification for this
        # work.
        for tag_name, subject_type in (
                ("ddc", Subject.DDC), ("lcc", Subject.LCC)):
            tag = cls._xpath1(work_tag,
                              "//oclc:%s/oclc:mostPopular" % tag_name)
            if tag is not None:
                id = tag.get('nsfa') or tag.get('sfa')
                weight = int(tag.get('holdings'))
                identifier.classify(data_source,
                                    subject_type,
                                    id,
                                    weight=weight)

        # Find FAST subjects for the work.
        for heading in cls._xpath(work_tag, "//oclc:fast//oclc:heading"):
            id = heading.get('ident')
            weight = int(heading.get('heldby'))
            value = heading.text
            identifier.classify(data_source, Subject.FAST, id, value, weight)

        # Associate the authors with the Edition.
        for contributor, roles in authors_and_roles:
            edition.add_contributor(contributor, roles)
        return edition, new

    @classmethod
    def extract_edition_record(cls, _db, edition_tag, existing_authors,
                               **restrictions):
        """Create a new Edition object with information about an
        edition of a book (identified by OCLC Number).
        """
        oclc_number = unicode(edition_tag.get('oclc'))
        try:
            int(oclc_number)
        except ValueError, e:
            # This record does not have a valid OCLC number.
            return None, False

        # Fill in some basic information about this new record.
        result = cls._extract_basic_info(_db, edition_tag, existing_authors,
                                         **restrictions)
        if not result:
            # This record did not meet one of the restrictions.
            return None, False

        title, authors_and_roles, language = result

        # Add a couple extra bits of OCLC-specific information.
        extra = {
            OCLC.HOLDING_COUNT: edition_tag.get('holdings'),
            OCLC.FORMAT: edition_tag.get('itemtype'),
        }

        # Get an identifier for this edition.
        identifier, ignore = Identifier.for_foreign_id(_db,
                                                       Identifier.OCLC_NUMBER,
                                                       oclc_number)

        # Create an Edition for source + identifier
        data_source = DataSource.lookup(_db, DataSource.OCLC)
        subjects = {}
        edition_record, new = get_one_or_create(_db,
                                                Edition,
                                                data_source=data_source,
                                                primary_identifier=identifier,
                                                create_method_kwargs=dict(
                                                    title=title,
                                                    language=language,
                                                    subjects=subjects,
                                                    extra=extra,
                                                ))

        for subject_type, oclc_code in ((Subject.LCC, "050"), (Subject.DDC,
                                                               "082")):
            classification = cls._xpath1(
                edition_tag,
                "oclc:classifications/oclc:class[@tag='%s']" % oclc_code)
            if classification is not None:
                value = classification.get("nsfa") or classification.get('sfa')
                identifier.classify(data_source, subject_type, value)

        # Associate each contributor with the new record.
        for author, roles in authors_and_roles:
            edition_record.add_contributor(author, roles)
        return edition_record, new
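
Driving the parser might look like the following hedged sketch; xml_response would be the raw body of an OCLC Classify lookup, and the restriction values are illustrative:

status, records = OCLCXMLParser.parse(
    _db, xml_response,
    title="Romance of the Three Kingdoms",
    authors=["Luo, Guanzhong"],
    language=["eng"],
)
if status == OCLCXMLParser.MULTI_WORK_STATUS:
    # For a multi-work response, records holds SWIDs; each one needs
    # its own single-work detail lookup.
    swids = records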
Example #12
    def extract_edition(cls, _db, work_tag, existing_authors, **restrictions):
        """Create a new Edition object with information about a
        work (identified by OCLC Work ID).
        """
        # TODO: 'pswid' is what it's called in older representations.
        # That code can be removed once we replace all representations.
        oclc_work_id = work_tag.get('owi') or work_tag.get('pswid')
        if not oclc_work_id:
            raise ValueError("Work has no owi")
        oclc_work_id = unicode(oclc_work_id)

        item_type = work_tag.get("itemtype")
        if (item_type.startswith('itemtype-book')
                or item_type.startswith('itemtype-compfile')):
            medium = Edition.BOOK_MEDIUM
        elif (item_type.startswith('itemtype-audiobook')
              or item_type.startswith('itemtype-music')):
            # Pretty much all Gutenberg texts, even the audio texts,
            # are based on a book, and the ones that aren't
            # (recordings of individual songs) probably aren't in OCLC
            # anyway. So we just want to get the books.
            # medium = Edition.AUDIO_MEDIUM
            medium = None
        elif item_type.startswith('itemtype-video'):
            #medium = Edition.VIDEO_MEDIUM
            medium = None
        elif item_type in cls.UNUSED_MEDIA:
            medium = None
        else:
            medium = None

        # Only create Editions for books with a recognized medium
        if medium is None:
            return None, False

        result = cls._extract_basic_info(_db, work_tag, existing_authors,
                                         **restrictions)
        if not result:
            # This record did not meet one of the restrictions.
            return None, False

        title, authors_and_roles, language = result

        # Record some extra OCLC-specific information
        editions = work_tag.get('editions')
        holdings = work_tag.get('holdings')

        # Get an identifier for this work.
        identifier, ignore = Identifier.for_foreign_id(_db,
                                                       Identifier.OCLC_WORK,
                                                       oclc_work_id)

        data_source = DataSource.lookup(_db, DataSource.OCLC)
        identifier.add_measurement(data_source, Measurement.HOLDINGS, holdings)
        identifier.add_measurement(data_source, Measurement.PUBLISHED_EDITIONS,
                                   editions)

        # Create an Edition for source + identifier
        edition, new = get_one_or_create(_db,
                                         Edition,
                                         data_source=data_source,
                                         primary_identifier=identifier,
                                         create_method_kwargs=dict(
                                             title=title,
                                             language=language,
                                         ))

        # Get the most popular Dewey and LCC classification for this
        # work.
        for tag_name, subject_type in (
                ("ddc", Subject.DDC), ("lcc", Subject.LCC)):
            tag = cls._xpath1(work_tag,
                              "//oclc:%s/oclc:mostPopular" % tag_name)
            if tag is not None:
                id = tag.get('nsfa') or tag.get('sfa')
                weight = int(tag.get('holdings'))
                identifier.classify(data_source,
                                    subject_type,
                                    id,
                                    weight=weight)

        # Find FAST subjects for the work.
        for heading in cls._xpath(work_tag, "//oclc:fast//oclc:heading"):
            id = heading.get('ident')
            weight = int(heading.get('heldby'))
            value = heading.text
            identifier.classify(data_source, Subject.FAST, id, value, weight)

        # Associate the authors with the Edition.
        for contributor, roles in authors_and_roles:
            edition.add_contributor(contributor, roles)
        return edition, new
Example #13
    def run(self):
        id_type, identifier = sys.argv[1:]
        identifier, ignore = Identifier.for_foreign_id(
            self._db, id_type, identifier
        )
        self.fix_identifier(identifier)
Example #14
    def extract_edition(cls, _db, work_tag, existing_authors, **restrictions):
        """Create a new Edition object with information about a
        work (identified by OCLC Work ID).
        """
        # TODO: 'pswid' is what it's called in older representations.
        # That code can be removed once we replace all representations.
        oclc_work_id = work_tag.get('owi') or work_tag.get('pswid')
        if not oclc_work_id:
            raise ValueError("Work has no owi")
        oclc_work_id = unicode(oclc_work_id)

        item_type = work_tag.get("itemtype")
        if (item_type.startswith('itemtype-book')
            or item_type.startswith('itemtype-compfile')):
            medium = Edition.BOOK_MEDIUM
        elif (item_type.startswith('itemtype-audiobook')
              or item_type.startswith('itemtype-music')):
            # Pretty much all Gutenberg texts, even the audio texts,
            # are based on a book, and the ones that aren't
            # (recordings of individual songs) probably aren't in OCLC
            # anyway. So we just want to get the books.
            # medium = Edition.AUDIO_MEDIUM
            medium = None
        elif item_type.startswith('itemtype-video'):
            #medium = Edition.VIDEO_MEDIUM
            medium = None
        elif item_type in cls.UNUSED_MEDIA:
            medium = None
        else:
            medium = None

        # Only create Editions for books with a recognized medium
        if medium is None:
            return None, False

        result = cls._extract_basic_info(_db, work_tag, existing_authors, **restrictions)
        if not result:
            # This record did not meet one of the restrictions.
            return None, False

        title, authors_and_roles, language = result

        # Record some extra OCLC-specific information
        editions = work_tag.get('editions')
        holdings = work_tag.get('holdings')

        # Get an identifier for this work.
        identifier, ignore = Identifier.for_foreign_id(
            _db, Identifier.OCLC_WORK, oclc_work_id
        )

        data_source = DataSource.lookup(_db, DataSource.OCLC)
        identifier.add_measurement(data_source, Measurement.HOLDINGS, holdings)
        identifier.add_measurement(
            data_source, Measurement.PUBLISHED_EDITIONS, editions)

        # Create an Edition for source + identifier
        edition, new = get_one_or_create(
            _db, Edition,
            data_source=data_source,
            primary_identifier=identifier,
            create_method_kwargs=dict(
                title=title,
                language=language,
            )
        )

        # Get the most popular Dewey and LCC classification for this
        # work.
        for tag_name, subject_type in (
                ("ddc", Subject.DDC),
                ("lcc", Subject.LCC)):
            tag = cls._xpath1(
                work_tag,
                "//oclc:%s/oclc:mostPopular" % tag_name)
            if tag is not None:
                id = tag.get('nsfa') or tag.get('sfa')
                weight = int(tag.get('holdings'))
                identifier.classify(
                    data_source, subject_type, id, weight=weight)

        # Find FAST subjects for the work.
        for heading in cls._xpath(
                work_tag, "//oclc:fast//oclc:heading"):
            id = heading.get('ident')
            weight = int(heading.get('heldby'))
            value = heading.text
            identifier.classify(
                data_source, Subject.FAST, id, value, weight)

        # Associate the authors with the Edition.
        for contributor, roles in authors_and_roles:
            edition.add_contributor(contributor, roles)
        return edition, new
Example #15
    def lookup_info_to_metadata(self, lookup_representation):
        """Transforms a NoveList JSON representation into a Metadata object"""

        if not lookup_representation.content:
            return None

        lookup_info = json.loads(lookup_representation.content)
        book_info = lookup_info['TitleInfo']
        if book_info:
            novelist_identifier = book_info.get('ui')
        if not book_info or not novelist_identifier:
            # NoveList didn't know the ISBN.
            return None

        primary_identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, novelist_identifier
        )
        metadata = Metadata(self.source, primary_identifier=primary_identifier)

        # Get the equivalent ISBN identifiers.
        metadata.identifiers += self._extract_isbns(book_info)

        author = book_info.get('author')
        if author:
            metadata.contributors.append(ContributorData(sort_name=author))

        description = book_info.get('description')
        if description:
            metadata.links.append(LinkData(
                rel=Hyperlink.DESCRIPTION, content=description,
                media_type=Representation.TEXT_PLAIN
            ))

        audience_level = book_info.get('audience_level')
        if audience_level:
            metadata.subjects.append(SubjectData(
                Subject.FREEFORM_AUDIENCE, audience_level
            ))

        novelist_rating = book_info.get('rating')
        if novelist_rating:
            metadata.measurements.append(MeasurementData(
                Measurement.RATING, novelist_rating
            ))

        # Extract feature content if it is available.
        series_info = None
        appeals_info = None
        lexile_info = None
        goodreads_info = None
        recommendations_info = None
        feature_content = lookup_info.get('FeatureContent')
        if feature_content:
            series_info = feature_content.get('SeriesInfo')
            appeals_info = feature_content.get('Appeals')
            lexile_info = feature_content.get('LexileInfo')
            goodreads_info = feature_content.get('GoodReads')
            recommendations_info = feature_content.get('SimilarTitles')

        metadata, title_key = self.get_series_information(
            metadata, series_info, book_info
        )
        metadata.title = book_info.get(title_key)
        subtitle = TitleProcessor.extract_subtitle(
            metadata.title, book_info.get('full_title')
        )
        metadata.subtitle = self._scrub_subtitle(subtitle)

        if appeals_info:
            extracted_genres = False
            for appeal in appeals_info:
                genres = appeal.get('genres')
                if genres:
                    for genre in genres:
                        metadata.subjects.append(SubjectData(
                            Subject.TAG, genre['Name']
                        ))
                        extracted_genres = True
                if extracted_genres:
                    break

        if lexile_info:
            metadata.subjects.append(SubjectData(
                Subject.LEXILE_SCORE, lexile_info['Lexile']
            ))

        if goodreads_info:
            metadata.measurements.append(MeasurementData(
                Measurement.RATING, goodreads_info['average_rating']
            ))

        metadata = self.get_recommendations(metadata, recommendations_info)

        # If nothing interesting comes from the API, ignore it.
        if not (metadata.measurements or metadata.series_position or
                metadata.series or metadata.subjects or metadata.links or
                metadata.subtitle or metadata.recommendations):
            metadata = None
        return metadata
Example #16
        # by now we can assume response is either empty or a list
        for item in resp_obj:
            # go through patron's checkouts and generate LoanInfo objects, 
            # with FulfillmentInfo objects included
            media_type = item.get('mediaType', 'eBook')
            isbn = item.get('isbn', None)
            can_renew = item.get('canRenew', None)
            title = item.get('title', None)
            authors = item.get('authors', None)
            # refers to checkout expiration date, not the downloadUrl's
            expires = item.get('expiration', None)
            if expires:
                expires = datetime.datetime.strptime(expires, self.EXPIRATION_DATE_FORMAT).date()

            identifier, made_new = Identifier.for_foreign_id(self._db, 
                    foreign_identifier_type=Identifier.RB_DIGITAL_ID, 
                    foreign_id=isbn, autocreate=False)

            # Note: if OneClick knows about a patron's checked-out item that wasn't
            # checked out through us, we ignore it
            if not identifier:
                continue

            files = item.get('files', [])
            for file in files:
                filename = file.get('filename', None)
                # assume fileFormat is same for all files associated with this checkout
                # and use the last one mentioned.  Ex: "fileFormat": "EPUB".
                # note: audio books don't list fileFormat field, just the filename, and the mediaType.
                file_format = file.get('fileFormat', None)
                if file_format == 'EPUB':
Example #17
        # by now we can assume response is either empty or a list
        for item in resp_obj:
            # go through patron's checkouts and generate LoanInfo objects, 
            # with FulfillmentInfo objects included
            media_type = item.get('mediaType', 'eBook')
            isbn = item.get('isbn', None)
            can_renew = item.get('canRenew', None)
            title = item.get('title', None)
            authors = item.get('authors', None)
            # refers to checkout expiration date, not the downloadUrl's
            expires = item.get('expiration', None)
            if expires:
                expires = datetime.datetime.strptime(expires, self.EXPIRATION_DATE_FORMAT).date()

            identifier, made_new = Identifier.for_foreign_id(self._db, 
                    foreign_identifier_type=Identifier.ONECLICK_ID, 
                    foreign_id=isbn, autocreate=False)

            # Note: if OneClick knows about a patron's checked-out item that wasn't
            # checked out through us, we ignore it
            if not identifier:
                continue

            files = item.get('files', [])
            for file in files:
                filename = file.get('filename', None)
                # assume fileFormat is same for all files associated with this checkout
                # and use the last one mentioned.  Ex: "fileFormat": "EPUB".
                # note: audio books don't list fileFormat field, just the filename, and the mediaType.
                file_format = file.get('fileFormat', None)
                if file_format == 'EPUB':
Example #18
    def test_recursively_equivalent_identifiers(self):

        # We start with a Gutenberg book.
        gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
        record, ignore = Edition.for_foreign_id(self._db, gutenberg,
                                                Identifier.GUTENBERG_ID, "100")
        gutenberg_id = record.primary_identifier

        # We use OCLC Classify to do a title/author lookup.
        oclc = DataSource.lookup(self._db, DataSource.OCLC)
        search_id, ignore = Identifier.for_foreign_id(self._db,
                                                      Identifier.OCLC_WORK,
                                                      "60010")
        gutenberg_id.equivalent_to(oclc, search_id, 1)

        # The title/author lookup associates the search term with two
        # different OCLC Numbers.
        oclc_id, ignore = Identifier.for_foreign_id(self._db,
                                                    Identifier.OCLC_NUMBER,
                                                    "9999")
        oclc_id_2, ignore = Identifier.for_foreign_id(self._db,
                                                      Identifier.OCLC_NUMBER,
                                                      "1000")

        search_id.equivalent_to(oclc, oclc_id, 1)
        search_id.equivalent_to(oclc, oclc_id_2, 1)

        # We then use OCLC Linked Data to connect one of the OCLC
        # Numbers with an ISBN.
        linked_data = DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)
        isbn_id, ignore = Identifier.for_foreign_id(self._db, Identifier.ISBN,
                                                    "900100434X")
        oclc_id.equivalent_to(linked_data, isbn_id, 1)

        # As it turns out, we have an Overdrive work record...
        overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)
        overdrive_record, ignore = Edition.for_foreign_id(
            self._db, overdrive, Identifier.OVERDRIVE_ID, "{111-222}")
        overdrive_id = overdrive_record.primary_identifier

        # ...which is tied (by Overdrive) to the same ISBN.
        overdrive_id.equivalent_to(overdrive, isbn_id, 1)

        # Finally, here's a completely unrelated Edition, which
        # will not be showing up.
        gutenberg2, ignore = Edition.for_foreign_id(self._db, gutenberg,
                                                    Identifier.GUTENBERG_ID,
                                                    "200")
        gutenberg2.title = "Unrelated Gutenberg record."

        levels = [
            record.equivalent_identifiers(policy=PresentationCalculationPolicy(
                equivalent_identifier_levels=i)) for i in range(0, 5)
        ]

        # At level 0, the only identifier found is the Gutenberg ID.
        assert set([gutenberg_id]) == set(levels[0])

        # At level 1, we pick up the title/author lookup.
        assert set([gutenberg_id, search_id]) == set(levels[1])

        # At level 2, we pick up the title/author lookup and the two
        # OCLC Numbers.
        assert set([gutenberg_id, search_id, oclc_id,
                    oclc_id_2]) == set(levels[2])

        # At level 3, we also pick up the ISBN.
        assert set([gutenberg_id, search_id, oclc_id, oclc_id_2,
                    isbn_id]) == set(levels[3])

        # At level 4, the recursion starts to go in the other
        # direction: we pick up the Overdrive ID that's equivalent to
        # the same ISBN as the OCLC Number.
        assert set([
            gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id, overdrive_id
        ]) == set(levels[4])
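
For intuition, equivalent_identifier_levels behaves like the depth of a breadth-first walk over the (undirected) equivalency graph. A toy sketch, independent of the database models:

def equivalents_at_level(start, edges, levels):
    """Toy BFS over an adjacency dict; edges must list both directions."""
    seen = {start}
    frontier = {start}
    for _ in range(levels):
        frontier = {
            neighbor
            for node in frontier
            for neighbor in edges.get(node, ())
        } - seen
        seen |= frontier
    return seen

# Mirrors the test: level 4 reaches the Overdrive ID via the shared ISBN.
edges = {
    "gutenberg": {"search"},
    "search": {"gutenberg", "oclc-9999", "oclc-1000"},
    "oclc-9999": {"search", "isbn"},
    "oclc-1000": {"search"},
    "isbn": {"oclc-9999", "overdrive"},
    "overdrive": {"isbn"},
}
assert equivalents_at_level("gutenberg", edges, 4) == {
    "gutenberg", "search", "oclc-9999", "oclc-1000", "isbn", "overdrive",
}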