    def test_from_config(self):
        # Without an integration, an error is raised.
        assert_raises(CannotLoadConfiguration, ContentCafeAPI.from_config,
                      self._db, object())

        # With incomplete integrations, an error is raised.
        integration = self._external_integration(
            ExternalIntegration.CONTENT_CAFE,
            goal=ExternalIntegration.METADATA_GOAL,
            username=u'yup')
        assert_raises(CannotLoadConfiguration, ContentCafeAPI.from_config,
                      self._db, object())

        integration.username = None
        integration.password = u'yurp'
        assert_raises(CannotLoadConfiguration, ContentCafeAPI.from_config,
                      self._db, object())

        integration.username = u'yup'
        result = ContentCafeAPI.from_config(
            self._db,
            None,
            uploader=DummyS3Uploader(),
            soap_client=DummyContentCafeSOAPClient())
        eq_(True, isinstance(result, ContentCafeAPI))

    def test_constructor(self):
        """Just test that we can create the object."""
        uploader = DummyS3Uploader()
        soap_client = DummyContentCafeSOAPClient()
        api = ContentCafeAPI(
            self._db, None, "user_id", "password", uploader,
            soap_client=soap_client
        )
        provider = ContentCafeCoverageProvider(
            self._db, api=api, uploader=uploader
        )

class ContentCafeDemandMeasurementSweep(IdentifierSweepMonitor):
    """Ensure that every ISBN directly associated with a commercial
    identifier has a recent demand measurement.

    :TODO: This misses a lot of ISBNs, since 3M and Axis ISBNs aren't
    directly associated with a commercial identifier.
    """

    def __init__(self, _db, batch_size=100, interval_seconds=3600*48):
        super(ContentCafeDemandMeasurementSweep, self).__init__(
            _db,
            "Content Cafe demand measurement sweep",
            interval_seconds)
        self.client = ContentCafeAPI(_db, mirror=None)
        self.batch_size = batch_size

    def identifier_query(self):
        # TODO: Outer join to Measurement. If measurement value is
        # None or less than a year old, skip it.
        input_identifier = aliased(Identifier)

        output_join_clause = Identifier.id==Equivalency.output_id
        input_join_clause = input_identifier.id==Equivalency.input_id

        qu = self._db.query(Identifier).join(
            Equivalency, output_join_clause).join(
                input_identifier, input_join_clause
            ).filter(Identifier.type==Identifier.ISBN).filter(
                input_identifier.type.in_(
                    [Identifier.OVERDRIVE_ID, Identifier.THREEM_ID,
                     Identifier.AXIS_360_ID])
            ).order_by(Identifier.id)
        return qu
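
    # A hedged sketch (not part of the original monitor) of the outer join
    # described in the TODO above: skip ISBNs that already have a popularity
    # measurement from the last year. It assumes a Measurement model imported
    # alongside Identifier and Equivalency, with `identifier_id`,
    # `quantity_measured`, and `taken_at` columns and a POPULARITY constant;
    # those names are assumptions, not verified against the real schema.
    def _identifier_query_without_recent_measurement(self):
        from datetime import datetime, timedelta
        from sqlalchemy import and_

        one_year_ago = datetime.utcnow() - timedelta(days=365)
        recent_measurement = and_(
            Measurement.identifier_id == Identifier.id,
            Measurement.quantity_measured == Measurement.POPULARITY,
            Measurement.taken_at >= one_year_ago,
        )
        # Keep only identifiers with no recent popularity measurement at all.
        return self.identifier_query().outerjoin(
            Measurement, recent_measurement
        ).filter(Measurement.id == None)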

    def process_identifier(self, identifier):
        isbn = identifier.identifier
        if isbn and (isbnlib.is_isbn10(isbn) or isbnlib.is_isbn13(isbn)):
            self.client.measure_popularity(identifier, self.client.ONE_YEAR_AGO)
        return True
    def test_providers_overdrive(self):
        # For an Overdrive collection...
        collection = MockOverdriveAPI.mock_collection(self._db)

        # In lieu of a proper mock API, create one that will crash
        # if it tries to make a real HTTP request.
        mock_content_cafe = ContentCafeAPI(self._db, None, object(), object(),
                                           self.uploader)
        resolver = IdentifierResolutionCoverageProvider(
            collection,
            overdrive_api_class=MockOverdriveAPI,
            content_cafe_api=mock_content_cafe,
            uploader=self.uploader)

        # We get three required coverage providers: Content Cafe, OCLC
        # Classify, and Overdrive.
        optional, [content_cafe, oclc_classify,
                   overdrive] = resolver.providers()
        eq_([], optional)
        assert isinstance(content_cafe, ContentCafeCoverageProvider)
        assert isinstance(oclc_classify, OCLCClassifyCoverageProvider)
        assert isinstance(overdrive, OverdriveBibliographicCoverageProvider)
    def test_providers_opds(self):
        # For an OPDS collection that goes against the open-access content
        # server...
        self._default_collection.external_integration.set_setting(
            Collection.DATA_SOURCE_NAME_SETTING, DataSource.OA_CONTENT_SERVER)
        uploader = object()
        # In lieu of a proper mock API, create one that will crash
        # if it tries to make a real HTTP request.
        mock_content_cafe = ContentCafeAPI(self._db, None, object(), object(),
                                           self.uploader)
        resolver = IdentifierResolutionCoverageProvider(
            self._default_collection,
            content_cafe_api=mock_content_cafe,
            uploader=uploader)

        # We get three required coverage providers: Content Cafe, OCLC
        # Classify, and OPDS Lookup Protocol.
        optional, [content_cafe, oclc_classify, opds] = resolver.providers()
        eq_([], optional)
        assert isinstance(content_cafe, ContentCafeCoverageProvider)
        assert isinstance(oclc_classify, OCLCClassifyCoverageProvider)
        assert isinstance(opds, LookupClientCoverageProvider)
        eq_(mock_content_cafe, content_cafe.content_cafe)
        eq_(self._default_collection, opds.collection)
class TestContentCafeAPI(DatabaseTest):

    base_path = os.path.split(__file__)[0]
    resource_path = os.path.join(base_path, "files", "content_cafe")

    def data_file(self, path):
        """Return the contents of a test data file."""
        return open(os.path.join(self.resource_path, path)).read()

    def setup(self):
        super(TestContentCafeAPI, self).setup()
        self.http = DummyHTTPClient()
        self.soap = MockSOAPClient(popularity_value=5)
        self.api = ContentCafeAPI(
            self._db, 'uid', 'pw', self.soap, self.http.do_get
        )
        self.identifier = self._identifier(identifier_type=Identifier.ISBN)
        self.args = dict(userid=self.api.user_id, password=self.api.password,
                         isbn=self.identifier.identifier)

    def test_from_config(self):
        # Without an integration, an error is raised.
        assert_raises(
            CannotLoadConfiguration, ContentCafeAPI.from_config, self._db
        )

        # With incomplete integrations, an error is raised.
        integration = self._external_integration(
            ExternalIntegration.CONTENT_CAFE,
            goal=ExternalIntegration.METADATA_GOAL,
            username=u'yup'
        )
        assert_raises(
            CannotLoadConfiguration, ContentCafeAPI.from_config, self._db
        )

        integration.username = None
        integration.password = u'yurp'
        assert_raises(
            CannotLoadConfiguration, ContentCafeAPI.from_config, self._db
        )

        integration.username = u'yup'
        result = ContentCafeAPI.from_config(
            self._db, soap_client=object()
        )
        eq_(True, isinstance(result, ContentCafeAPI))

        # NOTE: We can't test the case where soap_client is not
        # mocked, because the ContentCafeSOAPClient constructor makes
        # a real HTTP request to load its WSDL file. We might be able
        # to improve this by seeing how mockable SudsClient is, or by
        # mocking ContentCafeSOAPClient.WSDL_URL as a file:// URL.
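
    # A hypothetical sketch of the file:// approach the NOTE above suggests.
    # It assumes a local WSDL fixture ("content_cafe.wsdl"), a
    # ContentCafeSOAPClient import, a WSDL_URL class attribute, and an
    # integration already configured as in test_from_config; none of that is
    # verified here, which is why this stays a sketch rather than a test.
    def _sketch_from_config_with_local_wsdl(self):
        wsdl_path = os.path.join(self.resource_path, "content_cafe.wsdl")
        original_url = ContentCafeSOAPClient.WSDL_URL
        ContentCafeSOAPClient.WSDL_URL = "file://" + wsdl_path
        try:
            result = ContentCafeAPI.from_config(self._db)
            eq_(True, isinstance(result, ContentCafeAPI))
        finally:
            ContentCafeSOAPClient.WSDL_URL = original_url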

    def test_data_source(self):
        eq_(DataSource.CONTENT_CAFE, self.api.data_source.name)

    def test_create_metadata(self):

        class Mock(ContentCafeAPI):

            popularity_measurement = "a popularity measurement"
            annotate_calls = []

            def add_reviews(self, *args):
                self.add_reviews_called_with = args

            def add_descriptions(self, *args):
                self.add_descriptions_called_with = args

            def add_author_notes(self, *args):
                self.add_author_notes_called_with = args

            def add_excerpt(self, *args):
                self.add_excerpt_called_with = args

            def measure_popularity(self, *args):
                self.measure_popularity_called_with = args
                return self.popularity_measurement

            def is_suitable_image(self, image):
                self.is_suitable_image_called_with = image
                return True

        api = Mock(self._db, 'uid', 'pw', self.soap, self.http.do_get)
        m = api.create_metadata

        # First we will make a request for a cover image. If that
        # gives a 404 error, we return nothing and don't bother making
        # any more requests.
        self.http.queue_requests_response(404)
        eq_(None, m(self.identifier))
        request_url = self.http.requests.pop()
        image_url = api.image_url % self.args
        eq_(image_url, request_url)
        eq_([], self.http.requests)

        # If the cover image request succeeds, we turn it into a LinkData
        # and add it to a new Metadata object. We then pass the
        # Metadata object a number of other methods to get additional
        # information from Content Cafe.
        #
        # We then call measure_popularity, and add its return value
        # to Metadata.measurements.
        self.http.queue_requests_response(200, 'image/png', content='an image!')

        # Here's the result.
        metadata = m(self.identifier)

        # Here's the image LinkData.
        [image] = metadata.links
        eq_(Hyperlink.IMAGE, image.rel)
        eq_(image_url, image.href)
        eq_('image/png', image.media_type)
        eq_('an image!', image.content)

        # We ran the image through our mocked version of is_suitable_image,
        # and it said it was fine.
        eq_(image.content, api.is_suitable_image_called_with)

        # Here's the popularity measurement.
        eq_([api.popularity_measurement], metadata.measurements)

        # Confirm that the mock methods were called with the right
        # arguments -- their functionality is tested individually
        # below.
        expected_args = (metadata, self.identifier, self.args)
        for called_with in (
            api.add_reviews_called_with, api.add_descriptions_called_with,
            api.add_author_notes_called_with, api.add_excerpt_called_with,
        ):
            eq_(expected_args, called_with)
        eq_((self.identifier, api.ONE_YEAR_AGO),
            api.measure_popularity_called_with)

        # If measure_popularity returns nothing, metadata.measurements
        # will be left empty.
        api.popularity_measurement = None
        self.http.queue_requests_response(200, 'image/png', content='an image!')
        metadata = m(self.identifier)
        eq_([], metadata.measurements)

    def test_annotate_with_web_resources(self):
        metadata = Metadata(DataSource.CONTENT_CAFE)
        rel = self._str

        # We're going to be grabbing this URL and
        # scraping it.
        url_template = "http://url/%(arg1)s"
        args = dict(arg1='value')

        # A couple of useful functions for scraping.
        class MockScrapers(object):
            scrape_called = False
            explode_called = False
            def scrape(self, soup):
                self.scrape_called = True
                return [soup.find('content').string]

            def explode(self, soup):
                self.explode_called = True
                raise Exception("I'll never be called")
        scrapers = MockScrapers()

        # When the result of the HTTP request contains a certain phrase,
        # we don't even bother scraping.
        m = self.api.annotate_with_web_resources
        http = self.http
        http.queue_requests_response(
            200, 'text/html', content='There is no data!'
        )
        m(metadata, self.identifier, args, url_template, "no data!", rel,
          scrapers.explode)
        # We made the request but nothing happened.
        expect_url = url_template % args
        eq_(expect_url, self.http.requests.pop())
        eq_(False, scrapers.explode_called)
        eq_(None, metadata.title)
        eq_([], metadata.links)

        # Otherwise, we try to scrape.
        good_content = '<html><span class="PageHeader2">Book title</span><content>Here you go</content>'
        http.queue_requests_response(200, 'text/html', content=good_content)
        m(metadata, self.identifier, args, url_template, "no data!", rel,
          scrapers.scrape)
        eq_(True, scrapers.scrape_called)

        # We called _extract_title and took a Content Cafe title out
        # for the Metadata object.
        eq_("Book title", metadata.title)

        # Then we called mock_scrape, which gave us the content for
        # one LinkData.
        [link] = metadata.links
        eq_(rel, link.rel)
        eq_(None, link.href)
        eq_("text/html", link.media_type)
        eq_("Here you go", link.content)

    def test__extract_title(self):
        # Standalone test of the _extract_title helper method.

        def assert_title(title, expect):
            markup = '<html><span class="PageHeader2">%s</span><content>Description</content>' % title
            soup = BeautifulSoup(markup, 'lxml')
            eq_(expect, ContentCafeAPI._extract_title(soup))

        # A normal book title is successfully extracted.
        assert_title("A great book", "A great book")

        # A supposed title that's in KNOWN_BAD_TITLES is ignored.
        assert_title("No content currently exists for this item", None)

    def test_add_reviews(self):
        """Verify that add_reviews works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("reviews.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_reviews(metadata, self.identifier, self.args)

        # We extracted six reviews from the sample file.
        reviews = metadata.links
        eq_(6, len(reviews))
        assert all([x.rel==Hyperlink.REVIEW for x in reviews])
        assert "isn't a myth!" in reviews[0].content

        # We incidentally figured out the book's title.
        eq_("Shadow Thieves", metadata.title)

    def test_add_author_notes(self):
        """Verify that add_author_notes works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("author_notes.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_author_notes(metadata, self.identifier, self.args)

        [notes] = metadata.links
        eq_(Hyperlink.AUTHOR, notes.rel)
        assert 'Brenda researched turtles' in notes.content

        # We incidentally figured out the book's title.
        eq_("Franklin's Christmas Gift", metadata.title)

    def test_add_excerpt(self):
        """Verify that add_excerpt works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("excerpt.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_excerpt(metadata, self.identifier, self.args)

        [excerpt] = metadata.links
        eq_(Hyperlink.SAMPLE, excerpt.rel)
        assert 'Franklin loved his marbles.' in excerpt.content

        # We incidentally figured out the book's title.
        eq_("Franklin's Christmas Gift", metadata.title)

    def test_measure_popularity(self):
        """Verify that measure_popularity turns the output of
        a SOAP request into a MeasurementData.
        """
        cutoff = object()

        # Call it.
        result = self.api.measure_popularity(self.identifier, cutoff)

        # The SOAP client's estimated_popularity method was called.
        expect = (self.identifier.identifier, cutoff)
        eq_(expect, self.soap.estimated_popularity_calls.pop())

        # The result was turned into a MeasurementData.
        assert isinstance(result, MeasurementData)
        eq_(Measurement.POPULARITY, result.quantity_measured)
        eq_(self.soap.popularity_value, result.value)

        # If the SOAP API doesn't return a popularity value, no
        # MeasurementData is created.
        self.soap.popularity_value = None
        result = self.api.measure_popularity(self.identifier, cutoff)
        eq_(expect, self.soap.estimated_popularity_calls.pop())
        eq_(None, result)
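
    # A minimal sketch of the behavior verified above, not the actual
    # ContentCafeAPI code: a non-None value from the SOAP client's
    # estimated_popularity() becomes a POPULARITY MeasurementData, and a
    # None value yields no measurement at all.
    @staticmethod
    def _sketch_measure_popularity(soap_client, identifier, cutoff):
        value = soap_client.estimated_popularity(identifier.identifier, cutoff)
        if value is None:
            return None
        return MeasurementData(
            quantity_measured=Measurement.POPULARITY, value=value
        )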

    def test_is_suitable_image(self):
        # Images are rejected if we can tell they are Content Cafe's
        # stand-in images.
        m = ContentCafeAPI.is_suitable_image

        content = self.data_file("stand-in-image.png")
        eq_(False, m(content))

        # Otherwise, it's fine. We don't check that the image is
        # valid, only that it's not a stand-in image.
        eq_(True, m("I'm not a stand-in image."))