Example #1
    def lookup(self, identifier):
        """Requests NoveList metadata for a particular identifier

        :return: Metadata object or None
        """

        client_identifier = identifier.urn
        if identifier.type != Identifier.ISBN:
            return self.lookup_equivalent_isbns(identifier)

        params = dict(ClientIdentifier=client_identifier,
                      ISBN=identifier.identifier,
                      version=self.version,
                      profile=self.profile,
                      password=self.password)
        url = self._build_query(params)
        self.log.debug("NoveList lookup: %s", url)
        representation, from_cache = Representation.cacheable_post(
            self._db,
            unicode(url),
            params,
            max_age=self.MAX_REPRESENTATION_AGE,
            response_reviewer=self.review_response)

        return self.lookup_info_to_metadata(representation)
    def _check_for_gutenberg_first(self, url, headers, **kwargs):
        """Make a HEAD request for the given URL to make sure
        it doesn't redirect to gutenberg.org.
        """
        parsed = urlparse.urlparse(url)
        if parsed.netloc.endswith('unglue.it'):
            # It might be a redirect. Make a HEAD request to see where
            # it leads.
            head_response = requests.head(url, headers=headers)
            if head_response.status_code / 100 == 3:
                # Yes, it's a redirect.
                location = head_response.headers.get('location')
                if location:
                    parsed = urlparse.urlparse(location)
                    if parsed.netloc.endswith('gutenberg.org'):
                        # If we make this request we're going to be in
                        # for some trouble, and we won't even get
                        # anything useful. Act as though we got an
                        # unappetizing representation.
                        self.log.info("Not making request to gutenberg.org.")
                        return (
                            200,
                            {"content-type":
                             "application/vnd.librarysimplified-clickthrough"},
                            "Gated behind Gutenberg click-through"
                        )
        return Representation.simple_http_get(url, headers, **kwargs)
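A hook with this signature would presumably be passed as the `do_get` callable to `Representation.get`, the same way other examples here pass `do_get=self.do_get` or `do_get=Representation.browser_http_get`. A minimal sketch of such a call site (the surrounding method name is hypothetical):

    def get_with_gutenberg_check(self, url):
        # Hypothetical caller: route the GET through the redirect check
        # defined above by supplying it as the do_get hook.
        representation, cached = Representation.get(
            self._db, url, do_get=self._check_for_gutenberg_first)
        return representation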
Example #3
    def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
        if not path.startswith(self.BASE_URL):
            if not path.startswith("/"):
                path = "/" + path
            url = self.BASE_URL + path
        else:
            url = path
        joiner = '?'
        if '?' in url:
            joiner = '&'
        url += joiner + "api-key=" + self.api_key
        representation, cached = Representation.get(
            self._db, url, do_get=self.do_get, max_age=max_age, debug=True,
            pause_before=0.1)
        status = representation.status_code
        if status == 200:
            # Everything's fine.
            content = json.loads(representation.content)
            return content

        diagnostic = "Response from %s was: %r" % (
            url, representation.content
        )

        if status == 403:
            raise IntegrationException(
                "API authentication failed",
                "API key is most likely wrong. %s" % diagnostic
            )
        else:
            raise IntegrationException(
                "Unknown API error (status %s)" % status, diagnostic
            )
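Given the URL-building logic above, a relative path without a query string gets `?api-key=...` appended, while a path that already contains `?` gets `&api-key=...`. A rough illustration (the paths are hypothetical):

    # api.request("/lists.json")
    #   -> GET <BASE_URL>/lists.json?api-key=<api_key>
    # api.request("/lists.json?published-date=2020-01-01")
    #   -> GET <BASE_URL>/lists.json?published-date=2020-01-01&api-key=<api_key>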
Example #4
    def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
        if not path.startswith(self.BASE_URL):
            if not path.startswith("/"):
                path = "/" + path
            url = self.BASE_URL + path
        else:
            url = path
        joiner = '?'
        if '?' in url:
            joiner = '&'
        url += joiner + "api-key=" + self.api_key
        representation, cached = Representation.get(self._db,
                                                    url,
                                                    do_get=self.do_get,
                                                    max_age=max_age,
                                                    debug=True,
                                                    pause_before=0.1)
        status = representation.status_code
        if status == 200:
            # Everything's fine.
            content = json.loads(representation.content)
            return content

        diagnostic = "Response from %s was: %r" % (url, representation.content)

        if status == 403:
            raise IntegrationException(
                "API authentication failed",
                "API key is most likely wrong. %s" % diagnostic)
        else:
            raise IntegrationException(
                "Unknown API error (status %s)" % status, diagnostic)
    def lookup(self, identifier):
        """Requests NoveList metadata for a particular identifier

        :return: Metadata object or None
        """
        client_identifier = identifier.urn
        if identifier.type != Identifier.ISBN:
            return self.lookup_equivalent_isbns(identifier)

        params = dict(
            ClientIdentifier=client_identifier, ISBN=identifier.identifier,
            version=self.version, profile=self.profile, password=self.password
        )
        scrubbed_url = unicode(self.scrubbed_url(params))

        representation = self.cached_representation(scrubbed_url)
        if not representation:
            self.log.info("No cached NoveList request available.")

            url = self.build_query_url(params)
            self.log.debug("NoveList lookup: %s",  url)
            representation, from_cache = Representation.post(
                self._db, unicode(url), '', max_age=self.MAX_REPRESENTATION_AGE,
                response_reviewer=self.review_response
            )

            # Remove credential information from the Representation URL. This
            # avoids holding those details in an unexpected part of the database
            # and lets multiple libraries use the same cached representation.
            representation.url = scrubbed_url

        return self.lookup_info_to_metadata(representation)
Example #6
    def lookup(self, identifier):
        """Requests NoveList metadata for a particular identifier

        :return: Metadata object or None
        """
        client_identifier = identifier.urn
        if identifier.type != Identifier.ISBN:
            return self.lookup_equivalent_isbns(identifier)

        params = dict(ClientIdentifier=client_identifier,
                      ISBN=identifier.identifier,
                      version=self.version,
                      profile=self.profile,
                      password=self.password)
        scrubbed_url = unicode(self.scrubbed_url(params))

        representation = self.cached_representation(scrubbed_url)
        if not representation:
            self.log.info("No cached NoveList request available.")

            url = self.build_query_url(params)
            self.log.debug("NoveList lookup: %s", url)
            representation, from_cache = Representation.post(
                self._db,
                unicode(url),
                '',
                max_age=self.MAX_REPRESENTATION_AGE,
                response_reviewer=self.review_response)

            # Remove credential information from the Representation URL. This
            # avoids holding those details in an unexpected part of the database
            # and lets multiple libraries use the same cached representation.
            representation.url = scrubbed_url

        return self.lookup_info_to_metadata(representation)
    def lookup_by_viaf(self, viaf, working_sort_name=None,
                       working_display_name=None, do_get=None):
        url = self.LOOKUP_URL % dict(viaf=viaf)
        r, cached = Representation.get(
            self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
        )

        xml = r.content
        return self.parser.parse(xml, working_sort_name, working_display_name)
    def lookup_by_name(self, sort_name, display_name=None, do_get=None,
                       known_titles=None):
        """
        Asks VIAF for a list of author clusters matching the passed-in
        author name. Selects the cluster we deem the best match for
        the author we mean.

        :param sort_name: Author name in Last, First format.
        :param display_name: Author name in First Last format.
        :param do_get: Ask Representation to use HTTP GET?
        :param known_titles: A list of titles we know this author wrote.
        :return: (selected_candidate, match_confidences, contributor_titles) for selected ContributorData.
        """
        author_name = sort_name or display_name
        # from OCLC tech support:
        # VIAF's SRU endpoint can only return a maximum number of 10 records
        # when the recordSchema is http://viaf.org/VIAFCluster
        maximum_records = 10 # viaf maximum that's not ignored
        page = 1
        contributor_candidates = []

        # limit ourselves to reading the first 500 viaf clusters, on the
        # assumption that search match quality is unlikely to be usable after that.
        for page in range(1, 51):
            start_record = 1 + maximum_records * (page-1)
            scope = 'local.personalNames'
            if is_corporate_name(author_name):
                scope = 'local.corporateNames'

            url = self.SEARCH_URL.format(
                scope=scope, author_name=author_name.encode("utf8"),
                maximum_records=maximum_records, start_record=start_record
            )
            representation, cached = Representation.get(
                self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
            )
            xml = representation.content

            candidates = self.parser.parse_multiple(xml, sort_name, display_name, page)
            if not any(candidates):
                # Delete the representation so it's not cached.
                self._db.query(Representation).filter(
                    Representation.id==representation.id
                ).delete()
                # We ran out of clusters, so we can relax and move on to
                # ordering the returned results
                break

            contributor_candidates.extend(candidates)
            page += 1

        best_match = self.select_best_match(candidates=contributor_candidates,
            working_sort_name=author_name, known_titles=known_titles)

        return best_match
    def get_jsonld(self, url):
        representation, cached = Representation.get(self._db, url)
        try:
            data = jsonld.load_document(url)
        except Exception as e:
            self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
            return None, False

        if cached and not representation.content:
            representation, cached = Representation.get(
                self._db, url, max_age=0)

        if not representation.content:
            return None, False
        
        doc = {
            'contextUrl': None,
            'documentUrl': url,
            'document': representation.content.decode('utf8')
        }
        return doc, cached
    def lookup_name_title(self, viaf, do_get=None):
        url = self.LOOKUP_URL % dict(viaf=viaf)
        r, cached = Representation.get(
            self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
        )

        xml = r.content
        cluster = etree.fromstring(xml, parser=etree.XMLParser(recover=True))

        titles = []
        for potential_title in self.parser.name_titles_for_cluster(cluster):
            titles.append(potential_title)
        return titles
Example #11
    def get_jsonld(self, url):
        representation, cached = Representation.get(self._db, url)
        try:
            data = jsonld.load_document(url)
        except Exception as e:
            self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
            return None, False

        if cached and not representation.content:
            representation, cached = Representation.get(self._db,
                                                        url,
                                                        max_age=0)

        if not representation.content:
            return None, False

        doc = {
            'contextUrl': None,
            'documentUrl': url,
            'document': representation.content.decode('utf8')
        }
        return doc, cached
Example #12
    def lookup_by_viaf(self,
                       viaf,
                       working_sort_name=None,
                       working_display_name=None,
                       do_get=None):
        url = self.LOOKUP_URL % dict(viaf=viaf)
        r, cached = Representation.get(self._db,
                                       url,
                                       do_get=do_get,
                                       max_age=self.REPRESENTATION_MAX_AGE)

        xml = r.content
        return self.parser.parse(xml, working_sort_name, working_display_name)
Example #13
    def lookup_name_title(self, viaf, do_get=None):
        url = self.LOOKUP_URL % dict(viaf=viaf)
        r, cached = Representation.get(self._db,
                                       url,
                                       do_get=do_get,
                                       max_age=self.REPRESENTATION_MAX_AGE)

        xml = r.content
        cluster = etree.fromstring(xml, parser=etree.XMLParser(recover=True))

        titles = []
        for potential_title in self.parser.name_titles_for_cluster(cluster):
            titles.append(potential_title)
        return titles
Example #14
    def __init__(self, content_dict, **kwargs):
        super(AudiobookManifest, self).__init__(**kwargs)
        self.raw = content_dict

        # Metadata values that map directly onto the core spec.
        self.import_metadata('title')
        self.import_metadata('publisher')
        self.import_metadata('description')
        self.import_metadata('isbn', 'identifier')
        self.import_metadata('authors', 'author')
        self.import_metadata('narrators', 'narrator')
        self.import_metadata('minutes', 'duration', lambda x: x*60)

        # Metadata values that have no equivalent in the core spec,
        # but are potentially useful.
        self.import_metadata('size', 'schema:contentSize')
        self.import_metadata('titleid', 'rbdigital:id', str)
        self.import_metadata('hasDrm', 'rbdigital:hasDrm')
        self.import_metadata('encryptionKey', 'rbdigital:encryptionKey')

        # Spine items.
        for file_data in self.raw.get('files', []):
            self.import_spine(file_data)

        # Links.
        download_url = self.raw.get('downloadUrl')
        if download_url:
            self.add_link(
                download_url, 'alternate', 
                type=Representation.guess_media_type(download_url)
            )

        cover = self.best_cover(self.raw.get('images', []))
        if cover:
            self.add_link(
                cover, "cover", type=Representation.guess_media_type(cover)
            )
Example #15
    def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
        if not path.startswith(self.BASE_URL):
            if not path.startswith("/"):
                path = "/" + path
            url = self.BASE_URL + path
        else:
            url = path
        joiner = '?'
        if '?' in url:
            joiner = '&'
        url += joiner + "api-key=" + self.api_key
        representation, cached = Representation.get(
            self._db, url, do_get=self.do_get, max_age=max_age, debug=True,
            pause_before=0.1)
        content = json.loads(representation.content)
        return content
Example #16
    def lookup(self, identifier, **kwargs):
        """Requests NoveList metadata for a particular identifier

        :param kwargs: Keyword arguments passed into Representation.post().

        :return: Metadata object or None
        """
        client_identifier = identifier.urn
        if identifier.type != Identifier.ISBN:
            return self.lookup_equivalent_isbns(identifier)

        params = dict(
            ClientIdentifier=client_identifier,
            ISBN=identifier.identifier,
            version=self.version,
            profile=self.profile,
            password=self.password,
        )
        scrubbed_url = str(self.scrubbed_url(params))

        url = self.build_query_url(params)
        self.log.debug("NoveList lookup: %s", url)

        # We want to make an HTTP request for `url` but cache the
        # result under `scrubbed_url`. Define a 'URL normalization'
        # function that always returns `scrubbed_url`.
        def normalized_url(original):
            return scrubbed_url

        representation, from_cache = Representation.post(
            _db=self._db,
            url=str(url),
            data="",
            max_age=self.MAX_REPRESENTATION_AGE,
            response_reviewer=self.review_response,
            url_normalizer=normalized_url,
            **kwargs
        )

        # Commit to the database immediately to reduce the chance
        # that some other incoming request will try to create a
        # duplicate Representation and crash.
        self._db.commit()

        return self.lookup_info_to_metadata(representation)
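The `scrubbed_url` / `url_normalizer` pair above exists so that the HTTP request goes to the full, credential-bearing URL while the cached Representation is keyed by a credential-free one. A rough, self-contained sketch of that scrubbing idea (Python 3 standard library; the `profile` and `password` names come from the `params` dict above, everything else is an assumption):

    from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

    def scrub_url(url, secret_params=('profile', 'password')):
        # Drop credential query parameters so the URL is safe to store
        # and can serve as a shared cache key.
        parsed = urlparse(url)
        query = [(k, v) for k, v in parse_qsl(parsed.query)
                 if k not in secret_params]
        return urlunparse(parsed._replace(query=urlencode(query)))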
Example #17
    def oclc_number_for_isbn(self, isbn):
        """Turn an ISBN identifier into an OCLC Number identifier."""
        url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
        representation, cached = Representation.get(
            self._db, url, Representation.http_get_no_redirect)
        if not representation.location:
            raise IOError(
                "Expected %s to redirect, but couldn't find location." % url)

        location = representation.location
        match = self.URI_WITH_OCLC_NUMBER.match(location)
        if not match:
            raise IOError(
                "OCLC redirected ISBN lookup, but I couldn't make sense of the destination, %s"
                % location)
        oclc_number = match.groups()[0]
        return Identifier.for_foreign_id(self._db, Identifier.OCLC_NUMBER,
                                         oclc_number)[0]
Example #18
    def oclc_number_for_isbn(self, isbn):
        """Turn an ISBN identifier into an OCLC Number identifier."""
        url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
        representation, cached = Representation.get(
            self._db, url, Representation.http_get_no_redirect)
        if not representation.location:
            raise IOError(
                "Expected %s to redirect, but couldn't find location." % url
            )

        location = representation.location
        match = self.URI_WITH_OCLC_NUMBER.match(location)
        if not match:
            raise IOError(
                "OCLC redirected ISBN lookup, but I couldn't make sense of the destination, %s" % location)
        oclc_number = match.groups()[0]
        return Identifier.for_foreign_id(
            self._db, Identifier.OCLC_NUMBER, oclc_number)[0]
Example #19
    def mirror_hyperlink(self, hyperlink):
        resource = hyperlink.resource
        if not resource.representation:
            resource.representation, cached = Representation.get(
                self._db, resource.url, max_age=self.ONE_YEAR)
            representation = resource.representation
            if not representation.media_type or not representation.media_type.startswith('image/'):
                representation.fetch_exception = (
                    'Representation is not an image as expected.')
                return representation

            extension = self.image_extensions_for_types.get(
                representation.media_type, '')
            filename = "cover" + extension
            representation.mirror_url = self.uploader.cover_image_url(
                hyperlink.data_source, hyperlink.identifier,
                filename)
        self._db.commit()
        return resource.representation
Example #20
    def mirror_hyperlink(self, hyperlink):
        resource = hyperlink.resource
        if not resource.representation:
            resource.representation, cached = Representation.get(
                self._db, resource.url, max_age=self.ONE_YEAR)
            representation = resource.representation
            if not representation.media_type or not representation.media_type.startswith(
                    'image/'):
                representation.fetch_exception = (
                    'Representation is not an image as expected.')
                return representation

            extension = self.image_extensions_for_types.get(
                representation.media_type, '')
            filename = "cover" + extension
            representation.mirror_url = self.uploader.cover_image_url(
                hyperlink.data_source, hyperlink.identifier, filename)
        self._db.commit()
        return resource.representation
Example #21
    def lookup_by_identifier(self, identifier, processed_uris=set()):
        """Turn an Identifier into a JSON-LD document."""
        if identifier.type == Identifier.OCLC_WORK:
            foreign_type = 'work'
            url = self.WORK_BASE_URL
        elif identifier.type == Identifier.OCLC_NUMBER:
            foreign_type = "oclc"
            url = self.BASE_URL

        url = url % dict(id=identifier.identifier, type=foreign_type)
        if url in processed_uris:
            self.log.debug("SKIPPING %s, already processed.", url)
            return None, True
        processed_uris.add(url)
        representation, cached = Representation.get(self._db, url)
        try:
            data = jsonld.load_document(url)
        except Exception as e:
            self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
            return None, False
Example #22
    def open(self):
        if len(sys.argv) > 1:
            return open(sys.argv[1])

        url = Configuration.integration_url(
            Configuration.STAFF_PICKS_INTEGRATION, True
        )
        if not url.startswith(('https://', 'http://')):
            url = self.DEFAULT_URL_TEMPLATE % url
        self.log.info("Retrieving %s", url)
        representation, cached = Representation.get(
            self._db, url, do_get=Representation.browser_http_get,
            accept="text/csv", max_age=timedelta(days=1))
        if representation.status_code != 200:
            raise ValueError("Unexpected status code %s" % 
                             representation.status_code)
        if not representation.media_type.startswith("text/csv"):
            raise ValueError("Unexpected media type %s" % 
                             representation.media_type)
        return StringIO(representation.content)
Example #24
    def lookup_by_name(self, sort_name, display_name=None, do_get=None,
                       best_match=False):
        sort_name = sort_name or display_name
        # from OCLC tech support:
        # VIAF's SRU endpoint can only return a maximum number of 10 records
        # when the recordSchema is http://viaf.org/VIAFCluster
        maximum_records = 10 # viaf maximum that's not ignored
        page = 1
        contributor_candidates = []

        # limit ourselves to reading the first 500 viaf clusters, on the
        # assumption that search match quality is unlikely to be usable after that.
        for page in range(1, 51):
            start_record = 1 + maximum_records * (page-1)
            scope = 'local.personalNames'
            if is_corporate_name(sort_name):
                scope = 'local.corporateNames'

            url = self.SEARCH_URL.format(
                scope=scope, sort_name=sort_name.encode("utf8"),
                maximum_records=maximum_records, start_record=start_record
            )
            representation, cached = Representation.get(
                self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
            )
            xml = representation.content

            candidates = self.parser.parse_multiple(xml, sort_name, display_name, page)
            if not any(candidates):
                # Delete the representation so it's not cached.
                self._db.query(Representation).filter(
                    Representation.id==representation.id
                ).delete()
                # We ran out of clusters, so we can relax and move on to
                # ordering the returned results
                break

            contributor_candidates.extend(candidates)
            page += 1

        return self.select_best_match(contributor_candidates, sort_name)
Example #25
    def import_spine(self, file_data):
        """Import an RBdigital spine item as a Web Publication Manifest
        spine item.
        """
        href = file_data.get('downloadUrl')
        duration = file_data.get('minutes') * 60
        title = file_data.get('display')

        id = file_data.get('id')
        size = file_data.get('size')
        filename = file_data.get('filename')
        type = Representation.guess_media_type(filename)

        extra = {}
        for k, v, transform in (
            ('id', 'rbdigital:id', str),
            ('size', 'schema:contentSize', lambda x: x),
            ('minutes', 'duration', lambda x: x * 60),
        ):
            if k in file_data:
                extra[v] = transform(file_data[k])
        self.add_spine(href, type, title, **extra)
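For illustration, a hypothetical `file_data` entry containing every field read above would yield the following extra metadata (values made up):

    file_data = {
        'downloadUrl': 'http://example.com/part-01.mp3',
        'minutes': 5,
        'display': 'Part 1',
        'id': 101,
        'size': 4800000,
        'filename': 'part-01.mp3',
    }
    # import_spine(file_data) calls add_spine(href, type, title, **extra)
    # with extra == {'rbdigital:id': '101', 'schema:contentSize': 4800000,
    #                'duration': 300}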
Example #26
    def test_mirror(self,
                    name,
                    uploader_class,
                    bucket_type,
                    bucket_name,
                    open_access,
                    settings=None):
        # Arrange
        book_title = "1234567890"
        book_content = "1234567890"
        identifier = Identifier(type=Identifier.ISBN, identifier=book_title)
        representation = Representation(
            content=book_content, media_type=Representation.EPUB_MEDIA_TYPE)
        buckets = {
            bucket_type: bucket_name,
        }

        if settings:
            settings.update(buckets)
        else:
            settings = buckets

        s3_uploader = self._create_s3_uploader(uploader_class=uploader_class,
                                               **settings)

        self.minio_s3_client.create_bucket(Bucket=bucket_name)

        # Act
        book_url = s3_uploader.book_url(identifier, open_access=open_access)
        s3_uploader.mirror_one(representation, book_url)

        # Assert
        response = self.minio_s3_client.list_objects(Bucket=bucket_name)
        assert "Contents" in response
        assert len(response["Contents"]) == 1

        [object] = response["Contents"]

        assert object["Key"] == "ISBN/{0}.epub".format(book_title)
Example #27
    def lookup(self, identifier):
        """Requests NoveList metadata for a particular identifier

        :return: Metadata object or None
        """

        client_identifier = identifier.urn
        if identifier.type != Identifier.ISBN:
            return self.lookup_equivalent_isbns(identifier)

        params = dict(
            ClientIdentifier=client_identifier, ISBN=identifier.identifier,
            version=self.version, profile=self.profile, password=self.password
        )
        url = self._build_query(params)
        self.log.debug("NoveList lookup: %s", url)
        representation, from_cache = Representation.cacheable_post(
            self._db, unicode(url), params,
            max_age=self.MAX_REPRESENTATION_AGE,
            response_reviewer=self.review_response
        )

        return self.lookup_info_to_metadata(representation)
Example #28
    def lookup_by(self, **kwargs):
        """Perform an OCLC Classify lookup."""
        query_string = self.query_string(**kwargs)
        url = self.BASE_URL + query_string
        representation, cached = Representation.get(self._db, url)
        return representation.content
Example #29
    def lookup_by(self, **kwargs):
        """Perform an OCLC Classify lookup."""
        query_string = self.query_string(**kwargs)
        url = self.BASE_URL + query_string
        representation, cached = Representation.get(self._db, url)
        return representation.content
    def improve_description(self, id, metadata):
        """Improve the description associated with a book,
        if possible.

        This involves fetching an alternate OPDS entry that might
        contain more detailed descriptions than those available in the
        main feed.
        """
        alternate_links = []
        existing_descriptions = []
        everything_except_descriptions = []
        for x in metadata.links:
            if (x.rel == Hyperlink.ALTERNATE and x.href
                and x.media_type == OPDSFeed.ENTRY_TYPE):
                alternate_links.append(x)
            if x.rel == Hyperlink.DESCRIPTION:
                existing_descriptions.append((x.media_type, x.content))
            else:
                everything_except_descriptions.append(x)

        better_descriptions = []
        for alternate_link in alternate_links:
            # There should only be one alternate link, but we'll keep
            # processing them until we get a good description.

            # Fetch the alternate entry.
            representation, is_new = Representation.get(
                self._db, alternate_link.href, max_age=self.THIRTY_DAYS,
                do_get=self.http_get
            )

            if representation.status_code != 200:
                continue

            # Parse the alternate entry with feedparser and run it through
            # data_detail_for_feedparser_entry().
            parsed = feedparser.parse(representation.content)
            if len(parsed['entries']) != 1:
                # This is supposed to be a single entry, and it's not.
                continue
            [entry] = parsed['entries']
            data_source = self.data_source
            detail_id, new_detail, failure = self.data_detail_for_feedparser_entry(
                entry, data_source
            )
            if failure:
                # There was a problem parsing the entry.
                self.log.error(failure.exception)
                continue

            # TODO: Ideally we could verify that detail_id == id, but
            # right now they are always different -- one is an HTTPS
            # URI and one is an HTTP URI. So we omit this step and
            # assume the documents at both ends of the 'alternate'
            # link identify the same resource.

            # Find any descriptions present in the alternate view which
            # are not present in the original.
            new_descriptions = [
                x for x in new_detail['links']
                if x.rel == Hyperlink.DESCRIPTION
                and (x.media_type, x.content) not in existing_descriptions
            ]

            if new_descriptions:
                # Replace old descriptions with new descriptions.
                metadata.links = (
                    everything_except_descriptions + new_descriptions
                )
                break

        return metadata
Example #31
    def lookup_by_name(self,
                       sort_name,
                       display_name=None,
                       do_get=None,
                       known_titles=None):
        """
        Asks VIAF for a list of author clusters matching the passed-in
        author name. Selects the cluster we deem the best match for
        the author we mean.

        :param sort_name: Author name in Last, First format.
        :param display_name: Author name in First Last format.
        :param do_get: Ask Representation to use HTTP GET?
        :param known_titles: A list of titles we know this author wrote.
        :return: (selected_candidate, match_confidences, contributor_titles) for selected ContributorData.
        """
        author_name = sort_name or display_name
        # from OCLC tech support:
        # VIAF's SRU endpoint can only return a maximum number of 10 records
        # when the recordSchema is http://viaf.org/VIAFCluster
        maximum_records = 10  # viaf maximum that's not ignored
        page = 1
        contributor_candidates = []

        # limit ourselves to reading the first 500 viaf clusters, on the
        # assumption that search match quality is unlikely to be usable after that.
        for page in range(1, 51):
            start_record = 1 + maximum_records * (page - 1)
            scope = 'local.personalNames'
            if is_corporate_name(author_name):
                scope = 'local.corporateNames'

            url = self.SEARCH_URL.format(
                scope=scope,
                author_name=author_name.encode("utf8"),
                maximum_records=maximum_records,
                start_record=start_record)
            representation, cached = Representation.get(
                self._db,
                url,
                do_get=do_get,
                max_age=self.REPRESENTATION_MAX_AGE)
            xml = representation.content

            candidates = self.parser.parse_multiple(xml, sort_name,
                                                    display_name, page)
            if not any(candidates):
                # Delete the representation so it's not cached.
                self._db.query(Representation).filter(
                    Representation.id == representation.id).delete()
                # We ran out of clusters, so we can relax and move on to
                # ordering the returned results
                break

            contributor_candidates.extend(candidates)
            page += 1

        best_match = self.select_best_match(candidates=contributor_candidates,
                                            working_sort_name=author_name,
                                            known_titles=known_titles)

        return best_match
Example #32
    def improve_description(self, id, metadata):
        """Improve the description associated with a book,
        if possible.

        This involves fetching an alternate OPDS entry that might
        contain more detailed descriptions than those available in the
        main feed.
        """
        alternate_links = []
        existing_descriptions = []
        everything_except_descriptions = []
        for x in metadata.links:
            if (x.rel == Hyperlink.ALTERNATE and x.href
                    and x.media_type == OPDSFeed.ENTRY_TYPE):
                alternate_links.append(x)
            if x.rel == Hyperlink.DESCRIPTION:
                existing_descriptions.append((x.media_type, x.content))
            else:
                everything_except_descriptions.append(x)

        better_descriptions = []
        for alternate_link in alternate_links:
            # There should only be one alternate link, but we'll keep
            # processing them until we get a good description.

            # Fetch the alternate entry.
            representation, is_new = Representation.get(
                self._db,
                alternate_link.href,
                max_age=self.THIRTY_DAYS,
                do_get=self.http_get)

            if representation.status_code != 200:
                continue

            # Parse the alternate entry with feedparser and run it through
            # data_detail_for_feedparser_entry().
            parsed = feedparser.parse(representation.content)
            if len(parsed['entries']) != 1:
                # This is supposed to be a single entry, and it's not.
                continue
            [entry] = parsed['entries']
            data_source = self.data_source
            detail_id, new_detail, failure = self.data_detail_for_feedparser_entry(
                entry, data_source)
            if failure:
                # There was a problem parsing the entry.
                self.log.error(failure.exception)
                continue

            # TODO: Ideally we could verify that detail_id == id, but
            # right now they are always different -- one is an HTTPS
            # URI and one is an HTTP URI. So we omit this step and
            # assume the documents at both ends of the 'alternate'
            # link identify the same resource.

            # Find any descriptions present in the alternate view which
            # are not present in the original.
            new_descriptions = [
                x for x in new_detail['links']
                if x.rel == Hyperlink.DESCRIPTION and (
                    x.media_type, x.content) not in existing_descriptions
            ]

            if new_descriptions:
                # Replace old descriptions with new descriptions.
                metadata.links = (everything_except_descriptions +
                                  new_descriptions)
                break

        return metadata