Example #1
    def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
        if not path.startswith(self.BASE_URL):
            if not path.startswith("/"):
                path = "/" + path
            url = self.BASE_URL + path
        else:
            url = path
        joiner = '?'
        if '?' in url:
            joiner = '&'
        url += joiner + "api-key=" + self.api_key
        representation, cached = Representation.get(
            self._db, url, do_get=self.do_get, max_age=max_age, debug=True,
            pause_before=0.1)
        status = representation.status_code
        if status == 200:
            # Everything's fine.
            content = json.loads(representation.content)
            return content

        diagnostic = "Response from %s was: %r" % (
            url, representation.content
        )

        if status == 403:
            raise IntegrationException(
                "API authentication failed",
                "API key is most likely wrong. %s" % diagnostic
            )
        else:
            raise IntegrationException(
                "Unknown API error (status %s)" % status, diagnostic
            )
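The query-string handling above appends the api-key parameter with '?' or '&', depending on whether the URL already carries a query string. A minimal standalone sketch of just that behavior (the function name and values are illustrative, not part of the original class):

    def append_api_key(url, api_key):
        # Start a query string with '?' unless the URL already has one; then use '&'.
        joiner = '&' if '?' in url else '?'
        return url + joiner + "api-key=" + api_key

    assert append_api_key("https://api.example.com/lists.json", "KEY") == \
        "https://api.example.com/lists.json?api-key=KEY"
    assert append_api_key("https://api.example.com/lists.json?offset=20", "KEY") == \
        "https://api.example.com/lists.json?offset=20&api-key=KEY"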
Example #2
    def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
        if not path.startswith(self.BASE_URL):
            if not path.startswith("/"):
                path = "/" + path
            url = self.BASE_URL + path
        else:
            url = path
        joiner = '?'
        if '?' in url:
            joiner = '&'
        url += joiner + "api-key=" + self.api_key
        representation, cached = Representation.get(self._db,
                                                    url,
                                                    do_get=self.do_get,
                                                    max_age=max_age,
                                                    debug=True,
                                                    pause_before=0.1)
        status = representation.status_code
        if status == 200:
            # Everything's fine.
            content = json.loads(representation.content)
            return content

        diagnostic = "Response from %s was: %r" % (url, representation.content)

        if status == 403:
            raise IntegrationException(
                "API authentication failed",
                "API key is most likely wrong. %s" % diagnostic)
        else:
            raise IntegrationException(
                "Unknown API error (status %s)" % status, diagnostic)
    def lookup_by_viaf(self, viaf, working_sort_name=None,
                       working_display_name=None, do_get=None):
        url = self.LOOKUP_URL % dict(viaf=viaf)
        r, cached = Representation.get(
            self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
        )

        xml = r.content
        return self.parser.parse(xml, working_sort_name, working_display_name)
Example #4
    def lookup_by_name(self, sort_name, display_name=None, do_get=None,
                       known_titles=None):
        """
        Asks VIAF for a list of author clusters, matching the passed-in
        author name.  Selects the cluster we deem the best match for
        the author we mean.

        :param sort_name: Author name in Last, First format.
        :param display_name: Author name in First Last format.
        :param do_get: Ask Representation to use Http GET?
        :param known_titles: A list of titles we know this author wrote.
        :return: (selected_candidate, match_confidences, contributor_titles) for selected ContributorData.
        """
        author_name = sort_name or display_name
        # from OCLC tech support:
        # VIAF's SRU endpoint can only return a maximum number of 10 records
        # when the recordSchema is http://viaf.org/VIAFCluster
        maximum_records = 10 # viaf maximum that's not ignored
        page = 1
        contributor_candidates = []

        # limit ourselves to reading the first 500 viaf clusters, on the
        # assumption that search match quality is unlikely to be usable after that.
        for page in range(1, 51):
            start_record = 1 + maximum_records * (page-1)
            scope = 'local.personalNames'
            if is_corporate_name(author_name):
                scope = 'local.corporateNames'

            url = self.SEARCH_URL.format(
                scope=scope, author_name=author_name.encode("utf8"),
                maximum_records=maximum_records, start_record=start_record
            )
            representation, cached = Representation.get(
                self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
            )
            xml = representation.content

            candidates = self.parser.parse_multiple(xml, sort_name, display_name, page)
            if not any(candidates):
                # Delete the representation so it's not cached.
                self._db.query(Representation).filter(
                    Representation.id==representation.id
                ).delete()
                # We ran out of clusters, so we can relax and move on to
                # ordering the returned results
                break

            contributor_candidates.extend(candidates)
            page += 1

        best_match = self.select_best_match(candidates=contributor_candidates,
            working_sort_name=author_name, known_titles=known_titles)

        return best_match
Example #5
    def get_jsonld(self, url):
        representation, cached = Representation.get(self._db, url)
        try:
            data = jsonld.load_document(url)
        except Exception as e:
            self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
            return None, False

        if cached and not representation.content:
            representation, cached = Representation.get(
                self._db, url, max_age=0)

        if not representation.content:
            return None, False
        
        doc = {
            'contextUrl': None,
            'documentUrl': url,
            'document': representation.content.decode('utf8')
        }
        return doc, cached
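The dictionary built here has the same shape as the documents pyld's document loaders return (contextUrl, documentUrl, document), which fits the `jsonld.load_document` call above. Assuming pyld is the JSON-LD library in use, a caller could feed the cached text back into ordinary JSON-LD processing roughly like this; the helper below is an illustration, not part of the original class:

    import json
    from pyld import jsonld

    def expand_cached(doc):
        # doc is the dict returned by get_jsonld(); the raw JSON-LD text is in doc['document'].
        data = json.loads(doc['document'])
        return jsonld.expand(data)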
Example #6
    def get_jsonld(self, url):
        representation, cached = Representation.get(self._db, url)
        try:
            data = jsonld.load_document(url)
        except Exception as e:
            self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
            return None, False

        if cached and not representation.content:
            representation, cached = Representation.get(self._db,
                                                        url,
                                                        max_age=0)

        if not representation.content:
            return None, False

        doc = {
            'contextUrl': None,
            'documentUrl': url,
            'document': representation.content.decode('utf8')
        }
        return doc, cached
Example #7
    def lookup_name_title(self, viaf, do_get=None):
        url = self.LOOKUP_URL % dict(viaf=viaf)
        r, cached = Representation.get(
            self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
        )

        xml = r.content
        cluster = etree.fromstring(xml, parser=etree.XMLParser(recover=True))

        titles = []
        for potential_title in self.parser.name_titles_for_cluster(cluster):
            titles.append(potential_title)
        return titles
Example #8
    def lookup_by_viaf(self,
                       viaf,
                       working_sort_name=None,
                       working_display_name=None,
                       do_get=None):
        url = self.LOOKUP_URL % dict(viaf=viaf)
        r, cached = Representation.get(self._db,
                                       url,
                                       do_get=do_get,
                                       max_age=self.REPRESENTATION_MAX_AGE)

        xml = r.content
        return self.parser.parse(xml, working_sort_name, working_display_name)
Example #9
    def lookup_name_title(self, viaf, do_get=None):
        url = self.LOOKUP_URL % dict(viaf=viaf)
        r, cached = Representation.get(self._db,
                                       url,
                                       do_get=do_get,
                                       max_age=self.REPRESENTATION_MAX_AGE)

        xml = r.content
        cluster = etree.fromstring(xml, parser=etree.XMLParser(recover=True))

        titles = []
        for potential_title in self.parser.name_titles_for_cluster(cluster):
            titles.append(potential_title)
        return titles
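`etree.XMLParser(recover=True)` (assuming `etree` here is `lxml.etree`) tells the parser to salvage what it can from malformed XML instead of raising, which helps with responses that are not always well formed. A tiny standalone illustration:

    from lxml import etree

    broken = b"<cluster><mainHeading>Jane Austen</mainHeading>"  # missing closing </cluster>
    root = etree.fromstring(broken, parser=etree.XMLParser(recover=True))
    print(root.findtext("mainHeading"))  # prints "Jane Austen"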
Example #10
    def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
        if not path.startswith(self.BASE_URL):
            if not path.startswith("/"):
                path = "/" + path
            url = self.BASE_URL + path
        else:
            url = path
        joiner = '?'
        if '?' in url:
            joiner = '&'
        url += joiner + "api-key=" + self.api_key
        representation, cached = Representation.get(
            self._db, url, do_get=self.do_get, max_age=max_age, debug=True,
            pause_before=0.1)
        content = json.loads(representation.content)
        return content
Example #11
    def oclc_number_for_isbn(self, isbn):
        """Turn an ISBN identifier into an OCLC Number identifier."""
        url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
        representation, cached = Representation.get(
            self._db, url, Representation.http_get_no_redirect)
        if not representation.location:
            raise IOError(
                "Expected %s to redirect, but couldn't find location." % url)

        location = representation.location
        match = self.URI_WITH_OCLC_NUMBER.match(location)
        if not match:
            raise IOError(
                "OCLC redirected ISBN lookup, but I couldn't make sense of the destination, %s"
                % location)
        oclc_number = match.groups()[0]
        return Identifier.for_foreign_id(self._db, Identifier.OCLC_NUMBER,
                                         oclc_number)[0]
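`URI_WITH_OCLC_NUMBER` is not shown in this excerpt; it evidently captures the numeric OCLC identifier from the redirect target. A hypothetical stand-in that would satisfy the `match.groups()[0]` call above, assuming WorldCat-style redirect URLs (the real class defines its own pattern):

    import re

    # Hypothetical pattern: redirect targets of the form .../oclc/<number>
    URI_WITH_OCLC_NUMBER = re.compile(r'^https?://(?:www\.)?worldcat\.org/oclc/(\d+)')

    match = URI_WITH_OCLC_NUMBER.match("http://www.worldcat.org/oclc/21477110")
    assert match and match.groups()[0] == "21477110"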
Example #12
    def oclc_number_for_isbn(self, isbn):
        """Turn an ISBN identifier into an OCLC Number identifier."""
        url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
        representation, cached = Representation.get(
            self._db, url, Representation.http_get_no_redirect)
        if not representation.location:
            raise IOError(
                "Expected %s to redirect, but couldn't find location." % url
            )

        location = representation.location
        match = self.URI_WITH_OCLC_NUMBER.match(location)
        if not match:
            raise IOError(
                "OCLC redirected ISBN lookup, but I couldn't make sense of the destination, %s" % location)
        oclc_number = match.groups()[0]
        return Identifier.for_foreign_id(
            self._db, Identifier.OCLC_NUMBER, oclc_number)[0]
Example #13
    def mirror_hyperlink(self, hyperlink):
        resource = hyperlink.resource
        if not resource.representation:
            resource.representation, cached = Representation.get(
                self._db, resource.url, max_age=self.ONE_YEAR)
            representation = resource.representation
            if not representation.media_type or not representation.media_type.startswith(
                    'image/'):
                representation.fetch_exception = (
                    'Representation is not an image as expected.')
                return representation

            extension = self.image_extensions_for_types.get(
                representation.media_type, '')
            filename = "cover" + extension
            representation.mirror_url = self.uploader.cover_image_url(
                hyperlink.data_source, hyperlink.identifier, filename)
        self._db.commit()
        return resource.representation
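`image_extensions_for_types` is not shown in this excerpt; it presumably maps media types to file extensions so the mirrored cover gets a sensible filename. A hypothetical stand-in consistent with the `.get(media_type, '')` call above:

    # Hypothetical mapping; the real class defines its own table.
    image_extensions_for_types = {
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'image/gif': '.gif',
    }

    extension = image_extensions_for_types.get('image/jpeg', '')
    filename = "cover" + extension  # -> "cover.jpg"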
Example #14
    def mirror_hyperlink(self, hyperlink):
        resource = hyperlink.resource
        if not resource.representation:
            resource.representation, cached = Representation.get(
                self._db, resource.url, max_age=self.ONE_YEAR)
            representation = resource.representation
            if not representation.media_type or not representation.media_type.startswith('image/'):
                representation.fetch_exception = (
                    'Representation is not an image as expected.')
                return representation

            extension = self.image_extensions_for_types.get(
                representation.media_type, '')
            filename = "cover" + extension
            representation.mirror_url = self.uploader.cover_image_url(
                hyperlink.data_source, hyperlink.identifier,
                filename)
        self._db.commit()
        return resource.representation
Example #15
    def lookup_by_identifier(self, identifier, processed_uris=set()):
        """Turn an Identifier into a JSON-LD document."""
        if identifier.type == Identifier.OCLC_WORK:
            foreign_type = 'work'
            url = self.WORK_BASE_URL
        elif identifier.type == Identifier.OCLC_NUMBER:
            foreign_type = "oclc"
            url = self.BASE_URL

        url = url % dict(id=identifier.identifier, type=foreign_type)
        if url in processed_uris:
            self.log.debug("SKIPPING %s, already processed.", url)
            return None, True
        processed_uris.add(url)
        representation, cached = Representation.get(self._db, url)
        try:
            data = jsonld.load_document(url)
        except Exception as e:
            self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
            return None, False
Example #16
    def open(self):
        if len(sys.argv) > 1:
            return open(sys.argv[1])

        url = Configuration.integration_url(
            Configuration.STAFF_PICKS_INTEGRATION, True
        )
        if not url.startswith(('https://', 'http://')):
            url = self.DEFAULT_URL_TEMPLATE % url
        self.log.info("Retrieving %s", url)
        representation, cached = Representation.get(
            self._db, url, do_get=Representation.browser_http_get,
            accept="text/csv", max_age=timedelta(days=1))
        if representation.status_code != 200:
            raise ValueError("Unexpected status code %s" % 
                             representation.status_code)
        if not representation.media_type.startswith("text/csv"):
            raise ValueError("Unexpected media type %s" % 
                             representation.media_type)
        return StringIO(representation.content)
Example #18
    def lookup_by_name(self, sort_name, display_name=None, do_get=None,
                       best_match=False):
        sort_name = sort_name or display_name
        # from OCLC tech support:
        # VIAF's SRU endpoint can only return a maximum number of 10 records
        # when the recordSchema is http://viaf.org/VIAFCluster
        maximum_records = 10 # viaf maximum that's not ignored
        page = 1
        contributor_candidates = []

        # limit ourselves to reading the first 500 viaf clusters, on the
        # assumption that search match quality is unlikely to be usable after that.
        for page in range(1, 51):
            start_record = 1 + maximum_records * (page-1)
            scope = 'local.personalNames'
            if is_corporate_name(sort_name):
                scope = 'local.corporateNames'

            url = self.SEARCH_URL.format(
                scope=scope, sort_name=sort_name.encode("utf8"),
                maximum_records=maximum_records, start_record=start_record
            )
            representation, cached = Representation.get(
                self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
            )
            xml = representation.content

            candidates = self.parser.parse_multiple(xml, sort_name, display_name, page)
            if not any(candidates):
                # Delete the representation so it's not cached.
                self._db.query(Representation).filter(
                    Representation.id==representation.id
                ).delete()
                # We ran out of clusters, so we can relax and move on to
                # ordering the returned results
                break

            contributor_candidates.extend(candidates)
            page += 1

        return self.select_best_match(contributor_candidates, sort_name)
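The paging loop in these lookup_by_name variants asks VIAF's SRU endpoint for 10 clusters at a time and gives up after 50 pages, i.e. at most 500 clusters. A minimal sketch of the start-record arithmetic, kept separate from the original class:

    MAXIMUM_RECORDS = 10  # VIAF SRU cap when recordSchema is the VIAFCluster schema

    def start_record_for_page(page):
        # Page 1 starts at record 1, page 2 at record 11, ..., page 50 at record 491.
        return 1 + MAXIMUM_RECORDS * (page - 1)

    assert [start_record_for_page(p) for p in (1, 2, 50)] == [1, 11, 491]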
Example #19
    def lookup_by(self, **kwargs):
        """Perform an OCLC Classify lookup."""
        query_string = self.query_string(**kwargs)
        url = self.BASE_URL + query_string
        representation, cached = Representation.get(self._db, url)
        return representation.content
Example #21
    def lookup_by_name(self,
                       sort_name,
                       display_name=None,
                       do_get=None,
                       known_titles=None):
        """
        Asks VIAF for a list of author clusters, matching the passed-in 
        author name.  Selects the cluster we deem the best match for 
        the author we mean.

        :param sort_name: Author name in Last, First format.
        :param display_name: Author name in First Last format.
        :param do_get: Ask Representation to use Http GET?
        :param known_titles: A list of titles we know this author wrote.
        :return: (selected_candidate, match_confidences, contributor_titles) for selected ContributorData.
        """
        author_name = sort_name or display_name
        # from OCLC tech support:
        # VIAF's SRU endpoint can only return a maximum number of 10 records
        # when the recordSchema is http://viaf.org/VIAFCluster
        maximum_records = 10  # viaf maximum that's not ignored
        page = 1
        contributor_candidates = []

        # limit ourselves to reading the first 500 viaf clusters, on the
        # assumption that search match quality is unlikely to be usable after that.
        for page in range(1, 51):
            start_record = 1 + maximum_records * (page - 1)
            scope = 'local.personalNames'
            if is_corporate_name(author_name):
                scope = 'local.corporateNames'

            url = self.SEARCH_URL.format(
                scope=scope,
                author_name=author_name.encode("utf8"),
                maximum_records=maximum_records,
                start_record=start_record)
            representation, cached = Representation.get(
                self._db,
                url,
                do_get=do_get,
                max_age=self.REPRESENTATION_MAX_AGE)
            xml = representation.content

            candidates = self.parser.parse_multiple(xml, sort_name,
                                                    display_name, page)
            if not any(candidates):
                # Delete the representation so it's not cached.
                self._db.query(Representation).filter(
                    Representation.id == representation.id).delete()
                # We ran out of clusters, so we can relax and move on to
                # ordering the returned results
                break

            contributor_candidates.extend(candidates)
            page += 1

        best_match = self.select_best_match(candidates=contributor_candidates,
                                            working_sort_name=author_name,
                                            known_titles=known_titles)

        return best_match
Example #22
    def improve_description(self, id, metadata):
        """Improve the description associated with a book,
        if possible.

        This involves fetching an alternate OPDS entry that might
        contain more detailed descriptions than those available in the
        main feed.
        """
        alternate_links = []
        existing_descriptions = []
        everything_except_descriptions = []
        for x in metadata.links:
            if (x.rel == Hyperlink.ALTERNATE and x.href
                and x.media_type == OPDSFeed.ENTRY_TYPE):
                alternate_links.append(x)
            if x.rel == Hyperlink.DESCRIPTION:
                existing_descriptions.append((x.media_type, x.content))
            else:
                everything_except_descriptions.append(x)

        better_descriptions = []
        for alternate_link in alternate_links:
            # There should only be one alternate link, but we'll keep
            # processing them until we get a good description.

            # Fetch the alternate entry.
            representation, is_new = Representation.get(
                self._db, alternate_link.href, max_age=self.THIRTY_DAYS,
                do_get=self.http_get
            )

            if representation.status_code != 200:
                continue

            # Parse the alternate entry with feedparser and run it through
            # data_detail_for_feedparser_entry().
            parsed = feedparser.parse(representation.content)
            if len(parsed['entries']) != 1:
                # This is supposed to be a single entry, and it's not.
                continue
            [entry] = parsed['entries']
            data_source = self.data_source
            detail_id, new_detail, failure = self.data_detail_for_feedparser_entry(
                entry, data_source
            )
            if failure:
                # There was a problem parsing the entry.
                self.log.error(failure.exception)
                continue

            # TODO: Ideally we could verify that detail_id == id, but
            # right now they are always different -- one is an HTTPS
            # URI and one is an HTTP URI. So we omit this step and
            # assume the documents at both ends of the 'alternate'
            # link identify the same resource.

            # Find any descriptions present in the alternate view which
            # are not present in the original.
            new_descriptions = [
                x for x in new_detail['links']
                if x.rel == Hyperlink.DESCRIPTION
                and (x.media_type, x.content) not in existing_descriptions
            ]

            if new_descriptions:
                # Replace old descriptions with new descriptions.
                metadata.links = (
                    everything_except_descriptions + new_descriptions
                )
                break

        return metadata
Example #23
    def improve_description(self, id, metadata):
        """Improve the description associated with a book,
        if possible.

        This involves fetching an alternate OPDS entry that might
        contain more detailed descriptions than those available in the
        main feed.
        """
        alternate_links = []
        existing_descriptions = []
        everything_except_descriptions = []
        for x in metadata.links:
            if (x.rel == Hyperlink.ALTERNATE and x.href
                    and x.media_type == OPDSFeed.ENTRY_TYPE):
                alternate_links.append(x)
            if x.rel == Hyperlink.DESCRIPTION:
                existing_descriptions.append((x.media_type, x.content))
            else:
                everything_except_descriptions.append(x)

        better_descriptions = []
        for alternate_link in alternate_links:
            # There should only be one alternate link, but we'll keep
            # processing them until we get a good description.

            # Fetch the alternate entry.
            representation, is_new = Representation.get(
                self._db,
                alternate_link.href,
                max_age=self.THIRTY_DAYS,
                do_get=self.http_get)

            if representation.status_code != 200:
                continue

            # Parse the alternate entry with feedparser and run it through
            # data_detail_for_feedparser_entry().
            parsed = feedparser.parse(representation.content)
            if len(parsed['entries']) != 1:
                # This is supposed to be a single entry, and it's not.
                continue
            [entry] = parsed['entries']
            data_source = self.data_source
            detail_id, new_detail, failure = self.data_detail_for_feedparser_entry(
                entry, data_source)
            if failure:
                # There was a problem parsing the entry.
                self.log.error(failure.exception)
                continue

            # TODO: Ideally we could verify that detail_id == id, but
            # right now they are always different -- one is an HTTPS
            # URI and one is an HTTP URI. So we omit this step and
            # assume the documents at both ends of the 'alternate'
            # link identify the same resource.

            # Find any descriptions present in the alternate view which
            # are not present in the original.
            new_descriptions = [
                x for x in new_detail['links']
                if x.rel == Hyperlink.DESCRIPTION and (
                    x.media_type, x.content) not in existing_descriptions
            ]

            if new_descriptions:
                # Replace old descriptions with new descriptions.
                metadata.links = (everything_except_descriptions +
                                  new_descriptions)
                break

        return metadata