Esempio n. 1
0
    def test_circulationdata_can_be_deepcopied(self):
        """A populated CirculationData can be passed through deepcopy().

        This guards against storing something on the CirculationData
        (e.g., self.log) that would prevent it from being copied.
        """
        # These two objects are constructed but never attached to the
        # CirculationData below.
        SubjectData(Subject.TAG, "subject")
        ContributorData()

        gutenberg_id = IdentifierData(Identifier.GUTENBERG_ID, "1")
        download_link = LinkData(
            Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub"
        )
        epub_format = FormatData(
            Representation.EPUB_MEDIA_TYPE, DeliveryMechanism.NO_DRM
        )

        original = CirculationData(
            DataSource.GUTENBERG,
            primary_identifier=gutenberg_id,
            links=[download_link],
            licenses_owned=5,
            licenses_available=5,
            licenses_reserved=None,
            patrons_in_hold_queue=None,
            formats=[epub_format],
            default_rights_uri=RightsStatus.GENERIC_OPEN_ACCESS,
        )

        # Reaching the assertion at all means deepcopy didn't throw.
        duplicate = deepcopy(original)
        assert duplicate is not None
Esempio n. 2
0
    def lookup_info_to_metadata(self, lookup_representation):
        """Transform a NoveList JSON representation into a Metadata object.

        :param lookup_representation: An object whose ``content``
            attribute holds the JSON document to parse.
        :return: A Metadata object, or None when the representation has
            no content, when the document has no usable 'TitleInfo'
            (missing, empty, or lacking a 'ui' value), or when nothing
            interesting could be extracted from the document.
        """
        if not lookup_representation.content:
            return None

        lookup_info = json.loads(lookup_representation.content)

        # Use .get() so that a document with no 'TitleInfo' key at all is
        # handled the same way as one with an empty 'TitleInfo', rather
        # than raising KeyError.
        book_info = lookup_info.get('TitleInfo')
        novelist_identifier = book_info.get('ui') if book_info else None
        if not novelist_identifier:
            # NoveList didn't know the ISBN.
            return None

        primary_identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, novelist_identifier)
        metadata = Metadata(self.source, primary_identifier=primary_identifier)

        # Get the equivalent ISBN identifiers.
        metadata.identifiers += self._extract_isbns(book_info)

        author = book_info.get('author')
        if author:
            metadata.contributors.append(ContributorData(sort_name=author))

        description = book_info.get('description')
        if description:
            metadata.links.append(
                LinkData(rel=Hyperlink.DESCRIPTION,
                         content=description,
                         media_type=Representation.TEXT_PLAIN))

        audience_level = book_info.get('audience_level')
        if audience_level:
            metadata.subjects.append(
                SubjectData(Subject.FREEFORM_AUDIENCE, audience_level))

        novelist_rating = book_info.get('rating')
        if novelist_rating:
            metadata.measurements.append(
                MeasurementData(Measurement.RATING, novelist_rating))

        # Extract feature content if it is available. When it is not,
        # every one of these lookups yields None.
        feature_content = lookup_info.get('FeatureContent') or {}
        series_info = feature_content.get('SeriesInfo')
        appeals_info = feature_content.get('Appeals')
        lexile_info = feature_content.get('LexileInfo')
        goodreads_info = feature_content.get('GoodReads')
        recommendations_info = feature_content.get('SimilarTitles')

        metadata, title_key = self.get_series_information(
            metadata, series_info, book_info)
        metadata.title = book_info.get(title_key)
        subtitle = TitleProcessor.extract_subtitle(metadata.title,
                                                   book_info.get('full_title'))
        metadata.subtitle = self._scrub_subtitle(subtitle)

        # TODO: How well do we trust this data? We could conceivably bump up
        # the weight here.
        if appeals_info:
            for appeal in appeals_info:
                genres = appeal.get('genres')
                if genres:
                    for genre in genres:
                        metadata.subjects.append(
                            SubjectData(Subject.TAG, genre['Name']))
                    # Only the first appeal that lists genres is used.
                    break

        if lexile_info:
            metadata.subjects.append(
                SubjectData(Subject.LEXILE_SCORE, lexile_info['Lexile']))

        if goodreads_info:
            metadata.measurements.append(
                MeasurementData(Measurement.RATING,
                                goodreads_info['average_rating']))

        metadata = self.get_recommendations(metadata, recommendations_info)

        # If nothing interesting comes from the API, ignore it.
        if not (metadata.measurements or metadata.series_position
                or metadata.series or metadata.subjects or metadata.links
                or metadata.subtitle or metadata.recommendations):
            metadata = None
        return metadata
Esempio n. 3
0
class TestCirculationMonitor(Axis360Test):
    """Tests of Axis360CirculationMonitor.process_book()."""

    # Bibliographic information handed to process_book() by the tests.
    BIBLIOGRAPHIC_DATA = Metadata(
        DataSource.AXIS_360,
        publisher=u'Random House Inc',
        language='eng',
        title=u'Faith of My Fathers : A Family Memoir',
        imprint=u'Random House Inc2',
        published=datetime.datetime(2000, 3, 7, 0, 0),
        primary_identifier=IdentifierData(
            type=Identifier.AXIS_360_ID, identifier=u'0003642860'
        ),
        identifiers=[
            IdentifierData(type=Identifier.ISBN, identifier=u'9780375504587'),
        ],
        contributors=[
            ContributorData(
                sort_name=u"McCain, John",
                roles=[Contributor.PRIMARY_AUTHOR_ROLE],
            ),
            ContributorData(
                sort_name=u"Salter, Mark",
                roles=[Contributor.AUTHOR_ROLE],
            ),
        ],
        subjects=[
            SubjectData(
                type=Subject.BISAC,
                identifier=u'BIOGRAPHY & AUTOBIOGRAPHY / Political',
            ),
            SubjectData(type=Subject.FREEFORM_AUDIENCE, identifier=u'Adult'),
        ],
    )

    # Availability information for the same identifier.
    AVAILABILITY_DATA = CirculationData(
        data_source=DataSource.AXIS_360,
        primary_identifier=BIBLIOGRAPHIC_DATA.primary_identifier,
        licenses_owned=9,
        licenses_available=8,
        licenses_reserved=0,
        patrons_in_hold_queue=0,
        last_checked=datetime.datetime(2015, 5, 20, 2, 9, 8),
    )

    def _new_monitor(self):
        """Build the monitor under test, wired to the mock API classes."""
        return Axis360CirculationMonitor(
            self._db,
            self.collection,
            api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'),
        )

    def test_process_book(self):
        """process_book() creates an Edition, LicensePool, Work and
        CoverageRecord from the bibliographic and availability data.
        """
        analytics, is_new = create(
            self._db,
            ExternalIntegration,
            goal=ExternalIntegration.ANALYTICS_GOAL,
            protocol="core.local_analytics_provider",
        )

        monitor = self._new_monitor()
        book, pool = monitor.process_book(
            self.BIBLIOGRAPHIC_DATA, self.AVAILABILITY_DATA
        )

        # Basic bibliographic fields made it onto the Edition.
        eq_(u'Faith of My Fathers : A Family Memoir', book.title)
        eq_(u'eng', book.language)
        eq_(u'Random House Inc', book.publisher)
        eq_(u'Random House Inc2', book.imprint)

        eq_(Identifier.AXIS_360_ID, book.primary_identifier.type)
        eq_(u'0003642860', book.primary_identifier.identifier)

        # Exactly one equivalent identifier besides the primary one.
        (isbn,) = [
            candidate for candidate in book.equivalent_identifiers()
            if candidate is not book.primary_identifier
        ]
        eq_(Identifier.ISBN, isbn.type)
        eq_(u'9780375504587', isbn.identifier)

        contributor_names = sorted(c.sort_name for c in book.contributors)
        eq_(["McCain, John", "Salter, Mark"], contributor_names)

        classified_as = sorted([
            (c.subject.type, c.subject.identifier)
            for c in book.primary_identifier.classifications
        ])
        expected_subjects = [
            (Subject.BISAC, u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
            (Subject.FREEFORM_AUDIENCE, u'Adult'),
        ]
        eq_(expected_subjects, classified_as)

        # Availability fields made it onto the LicensePool.
        eq_(9, pool.licenses_owned)
        eq_(8, pool.licenses_available)
        eq_(0, pool.patrons_in_hold_queue)
        eq_(datetime.datetime(2015, 5, 20, 2, 9, 8), pool.last_checked)

        # Three circulation events were created, backdated to the
        # last_checked date of the license pool.
        circulation_events = pool.circulation_events
        expected_event_types = [
            u'distributor_title_add',
            u'distributor_check_in',
            u'distributor_license_add',
        ]
        eq_(expected_event_types, [ev.type for ev in circulation_events])
        for ev in circulation_events:
            eq_(ev.start, pool.last_checked)

        # A presentation-ready work has been created for the LicensePool.
        created_work = pool.work
        eq_(True, created_work.presentation_ready)
        eq_("Faith of My Fathers : A Family Memoir", created_work.title)

        # A CoverageRecord has been provided for this book in the Axis
        # 360 bibliographic coverage provider, so that in the future
        # it doesn't have to make a separate API request to ask about
        # this book.
        axis_records = [
            record for record in pool.identifier.coverage_records
            if record.data_source.name == DataSource.AXIS_360
            and record.operation is None
        ]
        eq_(1, len(axis_records))

    def test_process_book_updates_old_licensepool(self):
        """If the LicensePool already exists, the circulation monitor
        updates it.
        """
        book, pool = self._edition(
            with_license_pool=True,
            identifier_type=Identifier.AXIS_360_ID,
            identifier_id=u'0003642860',
        )
        # We start off with availability information based on the
        # default for test data.
        eq_(1, pool.licenses_owned)

        existing_identifier = IdentifierData(
            type=pool.identifier.type,
            identifier=pool.identifier.identifier,
        )
        bibliographic = Metadata(
            DataSource.AXIS_360, primary_identifier=existing_identifier
        )
        monitor = self._new_monitor()
        book, pool = monitor.process_book(
            bibliographic, self.AVAILABILITY_DATA
        )

        # Now we have information based on the CirculationData.
        eq_(9, pool.licenses_owned)
Esempio n. 4
0
    def extract_bibliographic(self, element):
        """Build a Metadata object, with CirculationData attached, from a
        dictionary of information from Enki.

        :param element: A dictionary describing a single title. The
            "id" and "isbn" keys are required; all others are optional.
        :return: A Metadata with attached CirculationData.
        """
        # TODO: it's not clear what these are or whether we'd find them
        # useful:
        #  dateSaved
        #  length
        #  publishDate
        primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])
        identifiers = [IdentifierData(Identifier.ISBN, element["isbn"])]

        author_name = element.get("author", None) or Edition.UNKNOWN_AUTHOR
        contributors = [ContributorData(sort_name=author_name)]

        links = []
        summary = element.get("description")
        if summary:
            description_link = LinkData(
                rel=Hyperlink.DESCRIPTION,
                content=summary,
                media_type="text/html",
            )
            links.append(description_link)

        # NOTE: When this method is called by, e.g. updated_titles(),
        # the large and small images are available separately. When
        # this method is called by get_item(), we only get a single
        # image, in 'cover'. In get_item() we ask that that image be 'large',
        # which means we'll be filing it as a normal-sized image.
        image_sources = (
            ("cover", Hyperlink.IMAGE),
            ("small_image", Hyperlink.THUMBNAIL_IMAGE),
            ("large_image", Hyperlink.IMAGE),
        )
        primary_image = None
        thumbnail = None
        for source_key, rel in image_sources:
            image_url = element.get(source_key)
            if not image_url:
                continue
            image_link = LinkData(
                rel=rel,
                href=image_url,
                media_type=Representation.PNG_MEDIA_TYPE,
            )
            if rel == Hyperlink.THUMBNAIL_IMAGE:
                # Hold the thumbnail back for now; once every image has
                # been seen it is attached to the primary image.
                thumbnail = image_link
                continue
            # Every non-thumbnail entry above is a full-size image. The
            # last one seen becomes the primary image.
            primary_image = image_link
            links.append(image_link)

        if thumbnail:
            if not primary_image:
                # No full-size image at all: promote the thumbnail.
                thumbnail.rel = Hyperlink.IMAGE
                links.append(thumbnail)
            else:
                # Record the thumbnail as the thumbnail _of_ the primary
                # image.
                primary_image.thumbnail = thumbnail

        # We treat 'subject', 'topic', and 'genre' as interchangeable
        # sets of tags. This data is based on BISAC but it's not reliably
        # presented in a form that can be parsed as BISAC.
        subjects = []
        already_tagged = set()
        for tag_key in ("subject", "topic", "genre"):
            for tag in element.get(tag_key, []):
                if tag and tag not in already_tagged:
                    subjects.append(
                        SubjectData(
                            Subject.TAG,
                            tag,
                            weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
                        )
                    )
                    already_tagged.add(tag)

        language = self.LANGUAGE_CODES.get(
            element.get("language", "English"), "eng"
        )

        metadata = Metadata(
            data_source=DataSource.ENKI,
            title=element.get("title"),
            language=language,
            medium=Edition.BOOK_MEDIUM,
            publisher=element.get("publisher"),
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            contributors=contributors,
            links=links,
            subjects=subjects,
        )
        metadata.circulation = self.extract_circulation(
            primary_identifier,
            element.get("availability", {}),
            element.get("formattype", None),
        )
        return metadata
Esempio n. 5
0
    def record_info_to_metadata(cls, book, availability):
        """Turn Odilo's JSON representation of a book into a Metadata
        object.

        Note:  The json data passed into this method is from a different file/stream
        from the json data that goes into the book_info_to_circulation() method.

        :param book: A dictionary of bibliographic information about
            one record.
        :param availability: Availability information, passed through
            unchanged to record_info_to_circulation().
        :return: None if `book` has no 'id' key. Otherwise a 2-tuple
            (Metadata, active), where `active` is the value of the
            book's 'active' key.
            NOTE(review): callers must handle both return shapes.
        """
        if 'id' not in book:
            return None

        odilo_id = book['id']
        primary_identifier = IdentifierData(Identifier.ODILO_ID, odilo_id)
        active = book.get('active')

        title = book.get('title')
        subtitle = book.get('subtitle')
        series = book.get('series')
        series_position = book.get('seriesPosition')

        contributors = []
        sort_author = book.get('author')
        if sort_author:
            roles = [Contributor.AUTHOR_ROLE]
            display_author = sort_name_to_display_name(sort_author)
            contributor = ContributorData(sort_name=sort_author,
                                          display_name=display_author,
                                          roles=roles,
                                          biography=None)
            contributors.append(contributor)

        publisher = book.get('publisher')

        # Metadata --> Marc21 260$c
        published = book.get('publicationDate')
        if not published:
            # yyyyMMdd --> record creation date
            published = book.get('releaseDate')

        if published:
            try:
                published = datetime.datetime.strptime(published, "%Y%m%d")
            except ValueError as e:
                # Exceptions have no `.message` attribute on Python 3;
                # formatting the exception itself with %s works
                # everywhere. The unparsed value is left in place.
                cls.log.warning(
                    'Cannot parse publication date from: %s, message: %s',
                    published, e)

        # yyyyMMdd --> record last modification date
        last_update = book.get('modificationDate')
        if last_update:
            try:
                last_update = datetime.datetime.strptime(last_update, "%Y%m%d")
            except ValueError as e:
                cls.log.warning(
                    'Cannot parse last update date from: %s, message: %s',
                    last_update, e)

        language = book.get('language', 'spa')

        subjects = [
            SubjectData(type=Subject.TAG, identifier=tag, weight=100)
            for tag in book.get('subjects', [])
        ]
        subjects.extend(
            SubjectData(type=Subject.BISAC, identifier=bisac_code, weight=100)
            for bisac_code in book.get('subjectsBisacCodes', [])
        )

        grade_level = book.get('gradeLevel')
        if grade_level:
            subjects.append(
                SubjectData(type=Subject.GRADE_LEVEL,
                            identifier=grade_level,
                            weight=10))

        # set_format() is given the `formats` list and returns a medium.
        # The medium from the last recognized format wins.
        medium = None
        file_format = book.get('fileFormat')
        formats = []
        for format_received in book.get('formats', []):
            if format_received in cls.format_data_for_odilo_format:
                medium = cls.set_format(format_received, formats)
            elif format_received == cls.ACSM and file_format:
                medium = cls.set_format(
                    format_received + '_' + file_format.upper(), formats)
            else:
                cls.log.warning('Unrecognized format received: %s',
                                format_received)

        if not medium:
            medium = Edition.BOOK_MEDIUM

        identifiers = []
        isbn = book.get('isbn')
        if isbn:
            # Store ISBN-10s in their ISBN-13 form.
            if isbnlib.is_isbn10(isbn):
                isbn = isbnlib.to_isbn13(isbn)
            identifiers.append(IdentifierData(Identifier.ISBN, isbn, 1))

        # A cover
        links = []
        cover_image_url = book.get('coverImageUrl')
        if cover_image_url:
            image_data = cls.image_link_to_linkdata(cover_image_url,
                                                    Hyperlink.THUMBNAIL_IMAGE)
            if image_data:
                links.append(image_data)

        original_image_url = book.get('originalImageUrl')
        if original_image_url:
            image_data = cls.image_link_to_linkdata(original_image_url,
                                                    Hyperlink.IMAGE)
            if image_data:
                links.append(image_data)

        # Descriptions become links.
        description = book.get('description')
        if description:
            links.append(
                LinkData(rel=Hyperlink.DESCRIPTION,
                         content=description,
                         media_type="text/html"))

        metadata = Metadata(data_source=DataSource.ODILO,
                            title=title,
                            subtitle=subtitle,
                            language=language,
                            medium=medium,
                            series=series,
                            series_position=series_position,
                            publisher=publisher,
                            published=published,
                            primary_identifier=primary_identifier,
                            identifiers=identifiers,
                            subjects=subjects,
                            contributors=contributors,
                            links=links,
                            data_source_last_updated=last_update)

        metadata.circulation = OdiloRepresentationExtractor.record_info_to_circulation(
            availability)
        # A record that is not 'active' still exists but is no longer in
        # the collection (it could be available again in the future), so
        # no licenses are owned for it.
        if not active:
            metadata.circulation.licenses_owned = 0
        metadata.circulation.formats = formats

        return metadata, active
Esempio n. 6
0
    def parse(cls, file, data_source_name, default_medium=None):
        """Parse an ONIX XML document into a list of Metadata objects.

        Only ONIX short tags are handled (see the TODO below).

        :param file: The document to parse; passed directly to
            ``etree.parse``.
        :param data_source_name: Used as the data source of every
            Metadata and CirculationData that is created.
        :param default_medium: The medium to assign to a product that
            has no ``b385`` tag. When this is not given, such a product
            gets whatever ``cls.PRODUCT_CONTENT_TYPES`` maps it to,
            falling back to ``EditionConstants.BOOK_MEDIUM``.
        :return: A list containing one Metadata (with CirculationData
            attached) per ``product`` element.
        :raise Exception: If a product has a usage constraint whose
            status is ``UsageStatus.PROHIBITED``.
        """
        metadata_records = []

        # TODO: ONIX has plain language 'reference names' and short tags that
        # may be used interchangably. This code currently only handles short tags,
        # and it's not comprehensive.

        parser = XMLParser()
        tree = etree.parse(file)
        root = tree.getroot()

        for record in root.findall("product"):
            title = parser.text_of_optional_subtag(
                record, "descriptivedetail/titledetail/titleelement/b203")
            if not title:
                # Fall back to assembling the title from its prefix and
                # remainder.
                title_prefix = parser.text_of_optional_subtag(
                    record, "descriptivedetail/titledetail/titleelement/b030")
                title_without_prefix = parser.text_of_optional_subtag(
                    record, "descriptivedetail/titledetail/titleelement/b031")
                if title_prefix and title_without_prefix:
                    title = title_prefix + " " + title_without_prefix

            medium = parser.text_of_optional_subtag(record, "b385")

            if not medium and default_medium:
                medium = default_medium
            else:
                medium = cls.PRODUCT_CONTENT_TYPES.get(
                    medium, EditionConstants.BOOK_MEDIUM)

            subtitle = parser.text_of_optional_subtag(
                record, "descriptivedetail/titledetail/titleelement/b029")
            language = (parser.text_of_optional_subtag(
                record, "descriptivedetail/language/b252") or "eng")
            publisher = parser.text_of_optional_subtag(
                record, "publishingdetail/publisher/b081")
            imprint = parser.text_of_optional_subtag(
                record, "publishingdetail/imprint/b079")
            if imprint == publisher:
                # An imprint identical to the publisher adds nothing.
                imprint = None

            publishing_date = parser.text_of_optional_subtag(
                record, "publishingdetail/publishingdate/b306")
            issued = None
            if publishing_date:
                issued = dateutil.parser.isoparse(publishing_date)
                if issued.tzinfo is None:
                    cls._logger.warning(
                        "Publishing date {} does not contain timezone information. Assuming UTC."
                        .format(publishing_date))
                issued = to_utc(issued)

            identifier_tags = parser._xpath(record, "productidentifier")
            identifiers = []
            primary_identifier = None
            for tag in identifier_tags:
                identifier_type = parser.text_of_subtag(tag, "b221")
                if identifier_type in ("02", "15"):
                    # The last matching identifier becomes the primary one.
                    primary_identifier = IdentifierData(
                        Identifier.ISBN, parser.text_of_subtag(tag, "b244"))
                    identifiers.append(primary_identifier)

            subject_tags = parser._xpath(record, "descriptivedetail/subject")
            subjects = []

            weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT
            for tag in subject_tags:
                subject_type = parser.text_of_subtag(tag, "b067")
                if subject_type in cls.SUBJECT_TYPES:
                    b069 = parser.text_of_optional_subtag(tag, "b069")

                    if b069:
                        subjects.append(
                            SubjectData(cls.SUBJECT_TYPES[subject_type],
                                        b069,
                                        weight=weight))

            # Recognized audiences are recorded as additional subjects.
            audience_tags = parser._xpath(record,
                                          "descriptivedetail/audience/b204")
            for tag in audience_tags:
                if tag.text in cls.AUDIENCE_TYPES:
                    subjects.append(
                        SubjectData(
                            Subject.FREEFORM_AUDIENCE,
                            cls.AUDIENCE_TYPES[tag.text],
                            weight=weight,
                        ))

            # TODO: We don't handle ONIX unnamed and alternatively named contributors.
            contributor_tags = parser._xpath(record,
                                             "descriptivedetail/contributor")
            contributors = []
            for tag in contributor_tags:
                role_code = parser.text_of_subtag(tag, "b035")
                if role_code in cls.CONTRIBUTOR_TYPES:
                    person_name_display = parser.text_of_optional_subtag(
                        tag, "b036")
                    person_name_inverted = parser.text_of_optional_subtag(
                        tag, "b037")
                    corp_name_display = parser.text_of_optional_subtag(
                        tag, "b047")
                    corp_name_inverted = parser.text_of_optional_subtag(
                        tag, "x443")
                    bio = parser.text_of_optional_subtag(tag, "b044")
                    family_name = None
                    if person_name_display or person_name_inverted:
                        display_name = person_name_display
                        sort_name = person_name_inverted
                        family_name = parser.text_of_optional_subtag(
                            tag, "b040")
                    elif corp_name_display or corp_name_inverted:
                        display_name = corp_name_display
                        # Sort form for corporate name might just be the display name
                        sort_name = corp_name_inverted or corp_name_display
                    else:
                        sort_name = display_name = None
                    contributors.append(
                        ContributorData(
                            sort_name=sort_name,
                            display_name=display_name,
                            family_name=family_name,
                            roles=[cls.CONTRIBUTOR_TYPES[role_code]],
                            biography=bio,
                        ))

            collateral_tags = parser._xpath(record,
                                            "collateraldetail/textcontent")
            links = []
            for tag in collateral_tags:
                text_type = parser.text_of_subtag(tag, "x426")
                # TODO: '03' is the summary in the example I'm testing, but that
                # might not be generally true.
                if text_type == "03":
                    text = parser.text_of_subtag(tag, "d104")
                    links.append(
                        LinkData(
                            rel=Hyperlink.DESCRIPTION,
                            media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                            content=text,
                        ))

            usage_constraint_tags = parser._xpath(
                record, "descriptivedetail/epubusageconstraint")
            # Unlimited access is assumed until a quantity limit is found.
            licenses_owned = LicensePool.UNLIMITED_ACCESS

            if usage_constraint_tags:
                cls._logger.debug("Found {0} EpubUsageConstraint tags".format(
                    len(usage_constraint_tags)))

            for usage_constraint_tag in usage_constraint_tags:
                usage_status = parser.text_of_subtag(usage_constraint_tag,
                                                     "x319")

                cls._logger.debug("EpubUsageStatus: {0}".format(usage_status))

                if usage_status == UsageStatus.PROHIBITED.value:
                    raise Exception("The content is prohibited")
                elif usage_status == UsageStatus.LIMITED.value:
                    # NOTE(review): this query is relative to the whole
                    # record, not to the current constraint, so it sees
                    # the limits of every constraint and the unpacking
                    # below fails if there is more than one in total --
                    # confirm whether that is intended.
                    usage_limit_tags = parser._xpath(
                        record,
                        "descriptivedetail/epubusageconstraint/epubusagelimit")

                    cls._logger.debug("Found {0} EpubUsageLimit tags".format(
                        len(usage_limit_tags)))

                    if not usage_limit_tags:
                        continue

                    [usage_limit_tag] = usage_limit_tags

                    usage_unit = parser.text_of_subtag(usage_limit_tag, "x321")

                    cls._logger.debug("EpubUsageUnit: {0}".format(usage_unit))

                    # Both alternatives must test the usage *unit*. The
                    # second one used to test the usage status, which is
                    # known to be LIMITED here, against a UsageUnit value.
                    if usage_unit in (UsageUnit.COPIES.value,
                                      UsageUnit.CONCURRENT_USERS.value):
                        quantity_limit = parser.text_of_subtag(
                            usage_limit_tag, "x320")

                        cls._logger.debug(
                            "Quantity: {0}".format(quantity_limit))

                        # The first limit replaces the "unlimited"
                        # marker; later limits add to the running total.
                        if licenses_owned == LicensePool.UNLIMITED_ACCESS:
                            licenses_owned = 0

                        licenses_owned += int(quantity_limit)

            metadata_records.append(
                Metadata(
                    data_source=data_source_name,
                    title=title,
                    subtitle=subtitle,
                    language=language,
                    medium=medium,
                    publisher=publisher,
                    imprint=imprint,
                    issued=issued,
                    primary_identifier=primary_identifier,
                    identifiers=identifiers,
                    subjects=subjects,
                    contributors=contributors,
                    links=links,
                    circulation=CirculationData(
                        data_source_name,
                        primary_identifier,
                        licenses_owned=licenses_owned,
                        licenses_available=licenses_owned,
                        licenses_reserved=0,
                        patrons_in_hold_queue=0,
                    ),
                ))

        return metadata_records
Esempio n. 7
0
    def parse(cls, file, data_source_name):
        """Parse an ONIX (short-tag) XML document into Metadata objects.

        :param file: Whatever ``etree.parse`` accepts (path or file-like
            object) containing the ONIX message.
        :param data_source_name: Passed through unchanged as the
            ``data_source`` of every Metadata object created.
        :return: A list with one Metadata object per ``product`` element
            found directly under the document root.
        :raises ValueError: If a publishing date (``b306``) is present but
            is not in ``YYYYMMDD`` form.
        """
        metadata_records = []

        # TODO: ONIX has plain language 'reference names' and short tags that
        # may be used interchangeably. This code currently only handles short
        # tags, and it's not comprehensive.

        parser = XMLParser()
        root = etree.parse(file).getroot()

        title_element = 'descriptivedetail/titledetail/titleelement/'
        weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT

        for record in root.findall('product'):
            # Prefer the full title text (b203); otherwise assemble the
            # title from its prefix (b030) and remainder (b031).
            title = parser.text_of_optional_subtag(record, title_element + 'b203')
            if not title:
                title_prefix = parser.text_of_optional_subtag(
                    record, title_element + 'b030')
                title_without_prefix = parser.text_of_optional_subtag(
                    record, title_element + 'b031')
                if title_without_prefix:
                    # A title element may legitimately carry no prefix at
                    # all; in that case the remainder is the whole title
                    # rather than something to be thrown away.
                    if title_prefix:
                        title = title_prefix + " " + title_without_prefix
                    else:
                        title = title_without_prefix

            subtitle = parser.text_of_optional_subtag(record, title_element + 'b029')
            language = parser.text_of_optional_subtag(
                record, 'descriptivedetail/language/b252') or "eng"
            publisher = parser.text_of_optional_subtag(
                record, 'publishingdetail/publisher/b081')
            imprint = parser.text_of_optional_subtag(
                record, 'publishingdetail/imprint/b079')
            if imprint == publisher:
                # An imprint identical to the publisher adds no information.
                imprint = None

            publishing_date = parser.text_of_optional_subtag(
                record, 'publishingdetail/publishingdate/b306')
            issued = None
            if publishing_date:
                issued = datetime.datetime.strptime(publishing_date, "%Y%m%d")

            # Identifier types '02' and '15' are both recorded as ISBNs.
            # When several are present, the last one seen becomes the
            # primary identifier.
            identifiers = []
            primary_identifier = None
            for tag in parser._xpath(record, 'productidentifier'):
                identifier_type = parser.text_of_subtag(tag, "b221")
                if identifier_type in ('02', '15'):
                    primary_identifier = IdentifierData(
                        Identifier.ISBN, parser.text_of_subtag(tag, 'b244'))
                    identifiers.append(primary_identifier)

            # Subjects whose scheme (b067) we recognize.
            subjects = []
            for tag in parser._xpath(record, 'descriptivedetail/subject'):
                scheme = parser.text_of_subtag(tag, 'b067')
                if scheme in cls.SUBJECT_TYPES:
                    subjects.append(
                        SubjectData(
                            cls.SUBJECT_TYPES[scheme],
                            parser.text_of_subtag(tag, 'b069'),
                            weight=weight
                        )
                    )

            # Recognized audience codes are stored as free-form audience
            # subjects alongside the regular subjects.
            for tag in parser._xpath(record, 'descriptivedetail/audience/b204'):
                if tag.text in cls.AUDIENCE_TYPES:
                    subjects.append(
                        SubjectData(
                            Subject.FREEFORM_AUDIENCE,
                            cls.AUDIENCE_TYPES[tag.text],
                            weight=weight
                        )
                    )

            # Contributors whose role code (b035) we recognize.
            contributors = []
            for tag in parser._xpath(record, 'descriptivedetail/contributor'):
                role_code = parser.text_of_subtag(tag, 'b035')
                if role_code in cls.CONTRIBUTOR_TYPES:
                    display_name = parser.text_of_subtag(tag, 'b036')
                    sort_name = parser.text_of_optional_subtag(tag, 'b037')
                    family_name = parser.text_of_optional_subtag(tag, 'b040')
                    bio = parser.text_of_optional_subtag(tag, 'b044')
                    contributors.append(ContributorData(
                        sort_name=sort_name,
                        display_name=display_name,
                        family_name=family_name,
                        roles=[cls.CONTRIBUTOR_TYPES[role_code]],
                        biography=bio))

            links = []
            for tag in parser._xpath(record, 'collateraldetail/textcontent'):
                text_type = parser.text_of_subtag(tag, 'x426')
                # TODO: '03' is the summary in the example I'm testing, but that
                # might not be generally true.
                if text_type == '03':
                    text = parser.text_of_subtag(tag, 'd104')
                    links.append(LinkData(rel=Hyperlink.DESCRIPTION,
                                          media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                                          content=text))

            metadata_records.append(Metadata(
                data_source=data_source_name,
                title=title,
                subtitle=subtitle,
                language=language,
                medium=Edition.BOOK_MEDIUM,
                publisher=publisher,
                imprint=imprint,
                issued=issued,
                primary_identifier=primary_identifier,
                identifiers=identifiers,
                subjects=subjects,
                contributors=contributors,
                links=links
            ))
        return metadata_records
Esempio n. 8
0
    def __init__(self, product):
        """Extract bibliographic data from a record's MARC variable fields.

        Each entry of ``product['varFields']`` (if any) is wrapped in a
        MarcTag and grouped by its ``marcTag`` value; the grouped tags are
        then mined for a title, contributors, subjects, links and
        identifiers, which are stored on the instance.

        :param product: A mapping-like object (only ``.get`` is used) that
            may contain a ``'varFields'`` list.
        """
        self.subjects = []
        self.identifiers = []
        self.contributors = []
        self.links = []
        self.product = product
        self.var = defaultdict(list)
        self.unrecognized_tags = dict()
        self.title = None
        for field in self.product.get('varFields', []):
            marctag = MarcTag(field)
            self.var[marctag.marcTag].append(marctag)

        # Find a title: the first non-empty $a among the 245 tags, then
        # the 240 tags.
        for num in ('245', '240'):
            for tag in self.tags(num):
                self.title = tag.a
                if self.title:
                    break
            if self.title:
                break

        # Contributors
        for tag in self.tags('100'):
            role = tag.e or 'author.'
            self.contributors.append(
                ContributorData(sort_name=tag.a, roles=[role]))

        # Subjects
        for number in ('050', '908'):
            for tag in self.tags(number):
                # Library of Congress classification
                if tag.a:
                    self.subjects.append(
                        SubjectData(type=Subject.LCC, identifier=tag.a))
                # TODO: tag.b ("Pap 2014eb") includes potentially useful
                # date information.

        # Links: skip image links, and keep only URLs labelled
        # 'Access eNYPL' in $y or $z.
        for tag in self.tags('856'):
            if tag.subfields.get('3', {}).get('content') == 'Image':
                continue
            if tag.u and 'Access eNYPL' in (tag.y, tag.z):
                self.links.append(LinkData(rel='alternate', href=tag.u))

        for tag in self.tags('082'):
            if tag.a:
                self.subjects.append(
                    SubjectData(type=Subject.DDC, identifier=tag.a))

        # NOTE(review): these tags are looked up by int while every other
        # lookup uses a string -- confirm self.tags() handles both.
        for number in range(650, 656):
            for tag in self.tags(number):
                # Subfield '2' names the subject scheme; without it (or
                # with an unknown scheme) the subject is a plain tag.
                scheme = getattr(tag, '2', None)
                native_type = Subject.TAG
                if scheme:
                    if scheme.endswith('.'):
                        scheme = scheme[:-1]
                    # NOTE(review): this tally on Representation.tag_type
                    # looks like a leftover debugging counter -- confirm.
                    Representation.tag_type[scheme] += 1
                    native_type = self.shadowcat_subject_type_to_native_type.get(
                        scheme, Subject.TAG)

                for identifier in (tag.a, tag.v):
                    if identifier:
                        self.subjects.append(
                            SubjectData(type=native_type, identifier=identifier))

        # Identifiers
        for tag in self.tags('037'):
            if tag.a and (tag.b in self.marc_037_b_to_identifier_type):
                identifier_type = self.marc_037_b_to_identifier_type[tag.b]
                self.identifiers.append(
                    IdentifierData(type=identifier_type, identifier=tag.a))

        for tag in self.tags('020'):
            isbn = tag.a
            if not isbn:
                continue
            # Every pattern is tried in turn; once one matches, later
            # patterns are applied to the already-extracted value.
            for regex in self.isbn_res:
                match = regex.search(isbn)
                if match:
                    isbn = match.groups()[0]
                    self.identifiers.append(
                        IdentifierData(type=Identifier.ISBN, identifier=isbn))

        # Audience notes become free-form audience subjects unless they
        # are blacklisted.
        for key in ['385', '521']:
            for tag in self.tags(key):
                identifier = tag.a
                if not identifier:
                    # Like every other field here, $a is optional; a tag
                    # without it has nothing to contribute (and calling
                    # .lower() on a missing value would raise).
                    continue
                if identifier.lower() in self.audience_blacklist:
                    continue
                self.subjects.append(
                    SubjectData(type=Subject.FREEFORM_AUDIENCE,
                                identifier=identifier))

        for tag in self.tags('035'):
            potential = tag.a
            if not potential:
                # Nothing to run the patterns against.
                continue
            # The first pattern that matches decides the identifier type.
            for regex, identifier_type in self.marc_035_a_to_identifier_type.items():
                match = regex.search(potential)
                if match:
                    identifier = match.groups()[0]
                    if identifier:
                        self.identifiers.append(
                            IdentifierData(type=identifier_type,
                                           identifier=identifier))
                    break

        # Keep track of items we haven't seen before.
        for key, var in self.var.items():
            if key not in self.known_vars:
                self.unrecognized_tags[key] = var
Esempio n. 9
0
    def parse(cls, file, data_source_name):
        """Parse an ONIX (short-tag) XML document into Metadata objects.

        Besides bibliographic data, each product's ``epubusageconstraint``
        elements are read to work out how many licenses are owned; the
        result is attached to the Metadata as a CirculationData.

        :param file: Whatever ``etree.parse`` accepts (path or file-like
            object) containing the ONIX message.
        :param data_source_name: Used as the data source of every Metadata
            and CirculationData object created.
        :return: A list with one Metadata object per ``product`` element
            found directly under the document root.
        :raises Exception: If any usage constraint of a product has the
            status ``UsageStatus.PROHIBITED``.
        :raises ValueError: If a publishing date (``b306``) is not in
            ``YYYYMMDD`` form, if a limited usage constraint carries more
            than one usage limit, or if a quantity is not an integer.
        """
        metadata_records = []

        # TODO: ONIX has plain language 'reference names' and short tags that
        # may be used interchangeably. This code currently only handles short
        # tags, and it's not comprehensive.

        parser = XMLParser()
        root = etree.parse(file).getroot()

        title_element = 'descriptivedetail/titledetail/titleelement/'
        weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT
        # Usage units whose quantity counts towards the owned licenses.
        counted_usage_units = (
            UsageUnit.COPIES.value,
            UsageUnit.CONCURRENT_USERS.value,
        )

        for record in root.findall('product'):
            # Prefer the full title text (b203); otherwise assemble the
            # title from its prefix (b030) and remainder (b031).
            title = parser.text_of_optional_subtag(
                record, title_element + 'b203')
            if not title:
                title_prefix = parser.text_of_optional_subtag(
                    record, title_element + 'b030')
                title_without_prefix = parser.text_of_optional_subtag(
                    record, title_element + 'b031')
                if title_without_prefix:
                    # A title element may legitimately carry no prefix at
                    # all; in that case the remainder is the whole title
                    # rather than something to be thrown away.
                    if title_prefix:
                        title = title_prefix + " " + title_without_prefix
                    else:
                        title = title_without_prefix

            subtitle = parser.text_of_optional_subtag(
                record, title_element + 'b029')
            language = parser.text_of_optional_subtag(
                record, 'descriptivedetail/language/b252') or "eng"
            publisher = parser.text_of_optional_subtag(
                record, 'publishingdetail/publisher/b081')
            imprint = parser.text_of_optional_subtag(
                record, 'publishingdetail/imprint/b079')
            if imprint == publisher:
                # An imprint identical to the publisher adds no information.
                imprint = None

            publishing_date = parser.text_of_optional_subtag(
                record, 'publishingdetail/publishingdate/b306')
            issued = None
            if publishing_date:
                issued = datetime.datetime.strptime(publishing_date, "%Y%m%d")

            # Identifier types '02' and '15' are both recorded as ISBNs.
            # When several are present, the last one seen becomes the
            # primary identifier.
            identifiers = []
            primary_identifier = None
            for tag in parser._xpath(record, 'productidentifier'):
                identifier_type = parser.text_of_subtag(tag, "b221")
                if identifier_type in ('02', '15'):
                    primary_identifier = IdentifierData(
                        Identifier.ISBN, parser.text_of_subtag(tag, 'b244'))
                    identifiers.append(primary_identifier)

            # Subjects whose scheme (b067) we recognize.
            subjects = []
            for tag in parser._xpath(record, 'descriptivedetail/subject'):
                scheme = parser.text_of_subtag(tag, 'b067')
                if scheme in cls.SUBJECT_TYPES:
                    subjects.append(
                        SubjectData(cls.SUBJECT_TYPES[scheme],
                                    parser.text_of_subtag(tag, 'b069'),
                                    weight=weight))

            # Recognized audience codes are stored as free-form audience
            # subjects alongside the regular subjects.
            for tag in parser._xpath(record,
                                     'descriptivedetail/audience/b204'):
                if tag.text in cls.AUDIENCE_TYPES:
                    subjects.append(
                        SubjectData(Subject.FREEFORM_AUDIENCE,
                                    cls.AUDIENCE_TYPES[tag.text],
                                    weight=weight))

            # Contributors whose role code (b035) we recognize.
            contributors = []
            for tag in parser._xpath(record,
                                     'descriptivedetail/contributor'):
                role_code = parser.text_of_subtag(tag, 'b035')
                if role_code in cls.CONTRIBUTOR_TYPES:
                    display_name = parser.text_of_subtag(tag, 'b036')
                    sort_name = parser.text_of_optional_subtag(tag, 'b037')
                    family_name = parser.text_of_optional_subtag(tag, 'b040')
                    bio = parser.text_of_optional_subtag(tag, 'b044')
                    contributors.append(
                        ContributorData(
                            sort_name=sort_name,
                            display_name=display_name,
                            family_name=family_name,
                            roles=[cls.CONTRIBUTOR_TYPES[role_code]],
                            biography=bio))

            links = []
            for tag in parser._xpath(record,
                                     'collateraldetail/textcontent'):
                text_type = parser.text_of_subtag(tag, 'x426')
                # TODO: '03' is the summary in the example I'm testing, but that
                # might not be generally true.
                if text_type == '03':
                    text = parser.text_of_subtag(tag, 'd104')
                    links.append(
                        LinkData(
                            rel=Hyperlink.DESCRIPTION,
                            media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                            content=text))

            # Licensing: access is unlimited unless a limited usage
            # constraint specifies a quantity of copies/concurrent users.
            usage_constraint_tags = parser._xpath(
                record, 'descriptivedetail/epubusageconstraint')
            licenses_owned = LicensePool.UNLIMITED_ACCESS

            if usage_constraint_tags:
                cls._logger.debug('Found {0} EpubUsageConstraint tags'.format(
                    len(usage_constraint_tags)))

            for usage_constraint_tag in usage_constraint_tags:
                usage_status = parser.text_of_subtag(usage_constraint_tag,
                                                     'x319')

                cls._logger.debug('EpubUsageStatus: {0}'.format(usage_status))

                if usage_status == UsageStatus.PROHIBITED.value:
                    raise Exception('The content is prohibited')
                if usage_status != UsageStatus.LIMITED.value:
                    continue

                # Only look at the limits that belong to *this* constraint.
                # Searching the whole record would pick up the limits of
                # sibling constraints as well.
                usage_limit_tags = parser._xpath(
                    usage_constraint_tag, 'epubusagelimit')

                cls._logger.debug('Found {0} EpubUsageLimit tags'.format(
                    len(usage_limit_tags)))

                if not usage_limit_tags:
                    continue

                # Exactly one limit per constraint is supported; more than
                # one raises ValueError here.
                [usage_limit_tag] = usage_limit_tags

                usage_unit = parser.text_of_subtag(usage_limit_tag, 'x321')

                cls._logger.debug('EpubUsageUnit: {0}'.format(usage_unit))

                # Compare the *unit* (not the status) against both
                # countable units.
                if usage_unit in counted_usage_units:
                    quantity_limit = parser.text_of_subtag(
                        usage_limit_tag, 'x320')

                    cls._logger.debug(
                        'Quantity: {0}'.format(quantity_limit))

                    # The first concrete quantity replaces the
                    # "unlimited" sentinel; later ones are added to it.
                    if licenses_owned == LicensePool.UNLIMITED_ACCESS:
                        licenses_owned = 0

                    licenses_owned += int(quantity_limit)

            metadata_records.append(
                Metadata(data_source=data_source_name,
                         title=title,
                         subtitle=subtitle,
                         language=language,
                         medium=Edition.BOOK_MEDIUM,
                         publisher=publisher,
                         imprint=imprint,
                         issued=issued,
                         primary_identifier=primary_identifier,
                         identifiers=identifiers,
                         subjects=subjects,
                         contributors=contributors,
                         links=links,
                         circulation=CirculationData(
                             data_source_name,
                             primary_identifier,
                             licenses_owned=licenses_owned,
                             licenses_available=licenses_owned,
                             licenses_reserved=0,
                             patrons_in_hold_queue=0)))

        return metadata_records
Esempio n. 10
0
                    elif len(isbn) != 13:
                        continue
                    if isbn:
                        metadata.identifiers.append(
                            IdentifierData(type=Identifier.ISBN,
                                           identifier=isbn))

        for subject_type, subjects_details in subjects.items():
            for subject_detail in subjects_details:
                if isinstance(subject_detail, dict):
                    subject_name = subject_detail.get('name')
                    subject_identifier = subject_detail.get('id')
                    metadata.subjects.append(
                        SubjectData(
                            type=subject_type,
                            identifier=subject_identifier,
                            name=subject_name,
                        ))
                else:
                    metadata.subjects.append(
                        SubjectData(type=subject_type,
                                    identifier=subject_detail))

        viafs = [self.VIAF_ID.search(uri) for uri in creator_uris]
        viafs = [viaf.groups()[0] for viaf in viafs if viaf is not None]
        for viaf in viafs:
            metadata.contributors.append(ContributorData(viaf=viaf))

        if creator_uris and not viafs:
            # We vastly prefer VIAF author information over OCLC.
            # We'll only extract OCLC author information if we have
Esempio n. 11
0
    def parse(cls, file, data_source_name):
        """Parse a MARC file into a list of Metadata objects.

        :param file: Handed directly to ``MARCReader``.
        :param data_source_name: Passed through unchanged as the
            ``data_source`` of every Metadata object created.
        :return: A list with one Metadata object per record yielded by the
            reader. Every record is treated as an English-language book.
        :raises ValueError: If a record's publication year is not of the
            form ``"YYYY."``.
        """
        reader = MARCReader(file)
        metadata_records = []

        for record in reader:
            # Strip the trailing ' /' separator from the title.
            title = record.title()
            if title.endswith(' /'):
                title = title[:-len(' /')]
            # The year is expected with a trailing period, e.g. "2014.".
            issued_year = datetime.datetime.strptime(record.pubyear(), "%Y.")
            publisher = record.publisher()
            if publisher.endswith(','):
                publisher = publisher[:-1]

            # The first note, when there is one, serves as the summary.
            # A record without notes simply gets no description link
            # instead of aborting the whole parse.
            links = []
            notes = record.notes()
            summary = notes[0]['a'] if notes else None

            if summary:
                summary_link = LinkData(
                    rel=Hyperlink.DESCRIPTION,
                    media_type=Representation.TEXT_PLAIN,
                    content=summary,
                )
                links.append(summary_link)

            # Keep only the part of 020$a before the first space.
            isbn = record['020']['a'].split(" ")[0]
            primary_identifier = IdentifierData(Identifier.ISBN, isbn)

            subjects = [
                SubjectData(
                    Classifier.FAST,
                    subject['a'],
                ) for subject in record.subjects()
            ]

            author = record.author()
            if author:
                # Turn 'Dante Alighieri,   1265-1321, author.'
                # into 'Dante Alighieri'. The metadata wrangler will
                # take it from there.
                for regex in cls.END_OF_AUTHOR_NAME_RES:
                    match = regex.search(author)
                    if match:
                        author = author[:match.start()]
                        break
            else:
                author = 'Anonymous'
            contributors = [
                ContributorData(
                    sort_name=author,
                    roles=[Contributor.AUTHOR_ROLE],
                )
            ]

            metadata_records.append(
                Metadata(data_source=data_source_name,
                         title=title,
                         language='eng',
                         medium=Edition.BOOK_MEDIUM,
                         publisher=publisher,
                         issued=issued_year,
                         primary_identifier=primary_identifier,
                         subjects=subjects,
                         contributors=contributors,
                         links=links))
        return metadata_records