def test_handles_non_ascii_content(self):
    """Check snippet extraction for unicode and UTF-8 byte string input.

    Every case supplies the same title and text, once as unicode objects,
    once as UTF-8 encoded byte strings and once mixed. The extracted
    value must always be the unicode string u'graben'.
    """
    cases = (
        # (title, text)
        (u'B\xe4ren', u'B\xe4rengraben'),        # both unicode
        ('B\xc3\xa4ren', 'B\xc3\xa4rengraben'),  # both UTF-8
        (u'B\xe4ren', 'B\xc3\xa4rengraben'),     # unicode title, UTF-8 text
    )
    extractor = SnippetTextExtractor()

    for title, text in cases:
        info = ResourceInfo(
            metadata={'title': title}, text=text, headers={})
        snippet = extractor.extract_value(info)

        self.assertEqual(u'graben', snippet)
        self.assertIsInstance(snippet, unicode)
def test_uid_is_different_for_different_urls(self):
    """Two distinct 'loc' URLs must not produce the same extracted UID."""
    extractor = UIDExtractor()

    first_uid = extractor.extract_value(
        ResourceInfo(url_info={'loc': 'http://example.org'}))
    second_uid = extractor.extract_value(
        ResourceInfo(url_info={'loc': 'http://example.org/foo'}))

    self.assertNotEqual(first_uid, second_uid)
def test_maps_header_to_value(self):
    """Each mapped content-type header yields its mapped value.

    The extracted value is also required to be a unicode object.
    """
    mapping = {'text/html': 'HTML', 'image/png': 'IMAGE'}
    extractor = HeaderMappingExtractor('content-type', mapping)

    # (header value sent, value expected from the mapping)
    expectations = (
        ('text/html', 'HTML'),
        ('image/png', 'IMAGE'),
    )
    for content_type, expected in expectations:
        info = ResourceInfo(headers={'content-type': content_type})
        value = extractor.extract_value(info)

        self.assertEqual(expected, value)
        self.assertIsInstance(value, unicode)
def test_returns_given_text(self):
    """The text set on the resource info is returned as unicode."""
    info = ResourceInfo(text=u'foobar')

    value = PlainTextExtractor().extract_value(info)

    self.assertEqual(u'foobar', value)
    self.assertIsInstance(value, unicode)
def test_sets_text_from_converter_on_resource_info(self):
    """Creating the engine stores the converter's text on the resource info.

    The converter is a mock whose extract_text() returns u'foo bar'.
    """
    info = ResourceInfo()
    fake_converter = MagicMock()
    fake_converter.extract_text = MagicMock(return_value=u'foo bar')

    # Only engine creation is exercised; its return value is not needed.
    self._create_engine(resource_info=info, converter=fake_converter)

    self.assertEqual(u'foo bar', info.text)
def test_set_metadata_from_converter_on_resource_info(self):
    """Creating the engine stores the converter's metadata on the resource info.

    The converter is a mock whose extract_metadata() returns {'foo': 'bar'}.
    """
    info = ResourceInfo()
    fake_converter = MagicMock()
    fake_converter.extract_metadata = MagicMock(
        return_value={'foo': 'bar'})

    # Only engine creation is exercised; its return value is not needed.
    self._create_engine(resource_info=info, converter=fake_converter)

    self.assertEqual({'foo': 'bar'}, info.metadata)
def test_raises_if_redirect(self, request):
    """fetch() raises AttemptedRedirect for a 301 redirect response.

    NOTE(review): `request` is presumably a mock injected by a patch
    decorator that is outside this view - confirm.
    """
    request.return_value = MockResponse(status_code=301, is_redirect=True)
    info = ResourceInfo(url_info={'loc': 'http://example.org/'})
    fetcher = self._create_fetcher(info)

    self.assertRaises(AttemptedRedirect, fetcher.fetch)
def test_extracts_title_from_metadata(self):
    """The 'title' metadata entry is returned as a unicode object."""
    info = ResourceInfo(metadata={'title': u'value'}, headers={})

    title = TitleExtractor().extract_value(info)

    self.assertEqual(u'value', title)
    self.assertIsInstance(title, unicode)
def test_falls_back_to_http_last_modified(self):
    """With empty url_info the 'last-modified' HTTP header is used."""
    extractor = LastModifiedExtractor()
    info = ResourceInfo(
        url_info={},
        headers={'last-modified': 'Wed, 31 Dec 2014 15:45:30 GMT'})

    expected = to_utc(datetime(2014, 12, 31, 15, 45, 30))
    actual = extractor.extract_value(info)

    self.assertEqual(expected, actual)
def test_extracts_creator(self):
    """The 'creator' metadata entry is returned as a unicode object."""
    info = ResourceInfo(metadata={'creator': 'John Doe'})

    creator = CreatorExtractor().extract_value(info)

    self.assertEqual(u'John Doe', creator)
    self.assertIsInstance(creator, unicode)
def test_returns_constant_value(self):
    """A ConstantExtractor bound to a field returns its configured value."""
    extractor = ConstantExtractor(42)
    extractor.bind(Field('example', extractor))

    value = extractor.extract_value(ResourceInfo())

    self.assertEqual(42, value)
def test_uses_default_if_header_not_mapped(self):
    """A header value missing from the mapping yields the default.

    The header is present ('pragma: no-cache') but the mapping is empty.
    """
    extractor = HeaderMappingExtractor('pragma', {}, default='DEFAULT')
    info = ResourceInfo(headers={'pragma': 'no-cache'})

    value = extractor.extract_value(info)

    self.assertEqual('DEFAULT', value)
    self.assertIsInstance(value, unicode)
def test_extracts_url_from_urlinfo(self):
    """The 'loc' entry of url_info is returned as a unicode URL."""
    info = ResourceInfo(url_info={'loc': 'http://example.org'})

    url = URLExtractor().extract_value(info)

    self.assertEqual(u'http://example.org', url)
    self.assertIsInstance(url, unicode)
def test_defaults_to_loc_if_no_target_given(self):
    """When url_info only carries 'loc', that URL is the target URL."""
    info = ResourceInfo(url_info={'loc': 'http://example.org'})

    target_url = TargetURLExtractor().extract_value(info)

    self.assertEqual(u'http://example.org', target_url)
    self.assertIsInstance(target_url, unicode)
def test_raises_if_attribute_not_found(self):
    """Extracting 'name' from a Site built with only a URL must raise.

    The expected exception is NoValueExtracted.
    """
    site = Site('http://example.org')
    extractor = SiteAttributeExtractor('name')
    info = ResourceInfo(site=site)

    self.assertRaises(NoValueExtracted, extractor.extract_value, info)
def test_defaults_to_index_html_for_empty_basename(self):
    """A URL ending in '/' on the host root yields the slug u'index-html'."""
    info = ResourceInfo(url_info={'loc': 'http://example.org/'})

    slug = SlugExtractor().extract_value(info)

    self.assertEqual(u'index-html', slug)
    self.assertIsInstance(slug, unicode)
def test_extracts_description(self):
    """The 'description' metadata entry is returned as a unicode object."""
    info = ResourceInfo(metadata={'description': 'value'})

    description = DescriptionExtractor().extract_value(info)

    self.assertEqual(u'value', description)
    self.assertIsInstance(description, unicode)
def test_raises_if_not_200_ok(self, request):
    """fetch() raises FetchingError when the response status is 404.

    NOTE(review): `request` is presumably a mock injected by a patch
    decorator that is outside this view - confirm.
    """
    request.return_value = MockResponse(status_code=404)
    info = ResourceInfo(url_info={'loc': 'http://example.org/'})
    fetcher = self._create_fetcher(info)

    self.assertRaises(FetchingError, fetcher.fetch)
def test_lastmod_from_urlinfo(self):
    """The 'lastmod' entry of url_info is extracted as a UTC datetime.

    The input carries a +01:00 offset, so 16:45:30 local time is
    expected to compare equal to 15:45:30 passed through to_utc().
    """
    extractor = LastModifiedExtractor()
    info = ResourceInfo(
        url_info={'lastmod': '2014-12-31T16:45:30+01:00'})

    expected = to_utc(datetime(2014, 12, 31, 15, 45, 30))
    actual = extractor.extract_value(info)

    self.assertEqual(expected, actual)
def test_applies_urlinfo_extractors_to_urlinfo(self):
    """The engine reports the url_info 'loc' under the field's name.

    The field is named 'EXAMPLE' and uses an ExampleURLInfoExtractor.
    """
    example_field = Field('EXAMPLE', extractor=ExampleURLInfoExtractor())
    info = ResourceInfo(url_info={'loc': 'http://example.org'})
    engine = self._create_engine(
        resource_info=info, fields=[example_field])

    field_values = engine.extract_field_values()

    self.assertEqual({'EXAMPLE': u'http://example.org'}, field_values)
def test_deals_with_non_ascii_characters_unicode(self):
    """A unicode URL with a non-ASCII basename yields an ASCII-only slug.

    The basename u'b\\xe4rengraben' is expected to become u'barengraben'.
    """
    info = ResourceInfo(
        url_info={'loc': u'http://example.org/b\xe4rengraben'})

    slug = SlugExtractor().extract_value(info)

    self.assertEqual(u'barengraben', slug)
    self.assertIsInstance(slug, unicode)
def test_deals_with_url_encoding(self):
    """A percent-encoded basename is turned into the slug u'foo-bar'."""
    # NOTE(review): the doubled percent sign in 'foo%%20bar' looks
    # unintended (a plain URL-encoded space would be 'foo%20bar') -
    # confirm against the upstream test before changing it.
    info = ResourceInfo(
        url_info={'loc': 'http://example.org/foo%%20bar'})

    slug = SlugExtractor().extract_value(info)

    self.assertEqual(u'foo-bar', slug)
    self.assertIsInstance(slug, unicode)
def test_deals_with_trailing_slash(self):
    """A trailing slash is ignored: '/foo/bar/' yields the slug u'bar'."""
    info = ResourceInfo(
        url_info={'loc': 'http://example.org/foo/bar/'})

    slug = SlugExtractor().extract_value(info)

    self.assertEqual(u'bar', slug)
    self.assertIsInstance(slug, unicode)
def test_builds_uid_based_on_url(self):
    """The UID extracted for 'http://example.org' is a fixed unicode value."""
    expected_uid = u'dab521de-65f9-250b-4cca-7383feef67dc'
    info = ResourceInfo(url_info={'loc': 'http://example.org'})

    uid = UIDExtractor().extract_value(info)

    self.assertEqual(expected_uid, uid)
    self.assertIsInstance(uid, unicode)
def test_uses_default_if_header_not_found(self):
    """With no headers at all the configured default is returned."""
    extractor = HeaderMappingExtractor(
        'content-type', {}, default='DEFAULT')
    info = ResourceInfo(headers={})

    value = extractor.extract_value(info)

    self.assertEqual('DEFAULT', value)
    self.assertIsInstance(value, unicode)
def test_applies_http_header_extractors_to_headers(self):
    """The engine reports the 'example-header' value under the field's name.

    The field is named 'EXAMPLE' and uses an ExampleHTTPHeaderExtractor
    configured for 'example-header'.
    """
    header_extractor = ExampleHTTPHeaderExtractor('example-header')
    example_field = Field('EXAMPLE', extractor=header_extractor)
    info = ResourceInfo(headers={'example-header': 'value'})
    engine = self._create_engine(
        fields=[example_field], resource_info=info)

    field_values = engine.extract_field_values()

    self.assertEqual({'EXAMPLE': u'value'}, field_values)
def test_equals_basename_for_simple_urls(self):
    """For 'http://example.org/foo/bar' the slug is the basename u'bar'."""
    info = ResourceInfo(
        url_info={'loc': 'http://example.org/foo/bar'})

    slug = SlugExtractor().extract_value(info)

    self.assertEqual(u'bar', slug)
    self.assertIsInstance(slug, unicode)
def _create_resource(self, asset_name):
    """Build a ResourceInfo that points at a bundled test asset.

    `asset_name` is resolved to a file name inside the
    'ftw.crawler.tests.assets' package. The resource is declared as
    'text/html' with empty metadata and headers.
    """
    asset_path = resource_filename('ftw.crawler.tests.assets', asset_name)
    # NOTE(review): 'http//example.org' lacks the colon after the scheme.
    # Kept as is because tests may depend on the literal - confirm.
    return ResourceInfo(
        metadata={},
        url_info={'loc': 'http//example.org'},
        headers={},
        filename=asset_path,
        content_type='text/html')
def test_extracts_whitespace_separated_keywords(self):
    """A space separated 'keywords' string is split into unicode items."""
    info = ResourceInfo(metadata={'keywords': u'Foo Bar Baz'})

    keywords = KeywordsExtractor().extract_value(info)

    self.assertEqual([u'Foo', u'Bar', u'Baz'], keywords)
    for keyword in keywords:
        self.assertIsInstance(keyword, unicode)
def test_returns_http_headers(self, request):
    """The resource info returned by fetch() carries the response headers.

    NOTE(review): `request` is presumably a mock injected by a patch
    decorator that is outside this view - confirm.
    """
    request.return_value = MockResponse(
        content='', headers={'Content-Type': 'text/html'})
    info = ResourceInfo(url_info={'loc': 'http://example.org/'})
    fetcher = self._create_fetcher(resource_info=info)

    fetched_info = fetcher.fetch()

    self.assertEqual({'Content-Type': 'text/html'}, fetched_info.headers)