def test_applies_text_extractors_to_converter_plain_text(self):
    # The mocked converter is constructed with u'foo bar' as its text; the
    # engine is expected to report that same value for the field backed by
    # ExampleTextExtractor.
    converter = MockConverter(text=u'foo bar')
    field = Field('EXAMPLE', extractor=ExampleTextExtractor())
    engine = self._create_engine(fields=[field], converter=converter)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        {'EXAMPLE': u'foo bar'},
        engine.extract_field_values())
def test_returns_constant_value(self):
    # A ConstantExtractor built with 42 must hand back 42 when asked to
    # extract a value from an (empty) ResourceInfo.
    extractor = ConstantExtractor(42)
    field = Field('example', extractor)
    extractor.bind(field)
    resource_info = ResourceInfo()

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(42, extractor.extract_value(resource_info))
def test_asserts_proper_type_for_multivalued_extractors(self):
    # A multivalued int field whose extractor yields [42] is expected to
    # come through extraction unchanged.
    field = Field(
        'int_field',
        extractor=ConstantExtractor([42]),
        type_=int,
        multivalued=True)
    engine = self._create_engine(fields=[field])

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual({'int_field': [42]}, engine.extract_field_values())
def test_applies_urlinfo_extractors_to_urlinfo(self):
    # The resource carries url_info with a 'loc' entry; the field backed by
    # ExampleURLInfoExtractor is expected to end up with that URL.
    field = Field('EXAMPLE', extractor=ExampleURLInfoExtractor())
    resource_info = ResourceInfo(url_info={'loc': 'http://example.org'})
    engine = self._create_engine(
        resource_info=resource_info, fields=[field])

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        {'EXAMPLE': u'http://example.org'},
        engine.extract_field_values())
def test_asserts_proper_type_for_extractors(self):
    # The field is declared as int but its extractor supplies the string
    # 'foo'; extracting the field values must raise ExtractionError.
    string_extractor = ConstantExtractor('foo')
    int_field = Field('int_field', extractor=string_extractor, type_=int)
    engine = self._create_engine(fields=[int_field])

    self.assertRaises(ExtractionError, engine.extract_field_values)
def test_applies_http_header_extractors_to_headers(self):
    # The resource carries an 'example-header' header; the field backed by
    # ExampleHTTPHeaderExtractor('example-header') is expected to receive
    # that header's value.
    field = Field(
        'EXAMPLE',
        extractor=ExampleHTTPHeaderExtractor('example-header'))
    resource_info = ResourceInfo(headers={'example-header': 'value'})
    engine = self._create_engine(
        fields=[field], resource_info=resource_info)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        {'EXAMPLE': u'value'},
        engine.extract_field_values())
def test_applies_site_config_extractors_to_site(self):
    # The site is configured with a 'name' attribute; the field backed by
    # SiteAttributeExtractor('name') is expected to receive its value.
    field = Field('EXAMPLE', extractor=SiteAttributeExtractor('name'))
    site = Site('http://example.org', attributes={'name': 'My Site'})
    resource_info = ResourceInfo(site=site)
    engine = self._create_engine(
        fields=[field], resource_info=resource_info)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        {'EXAMPLE': u'My Site'},
        engine.extract_field_values())
def test_returns_unicode_for_string_constant(self):
    # A ConstantExtractor built from the plain string 'foo' must return the
    # equal value as a unicode instance.
    extractor = ConstantExtractor('foo')
    field = Field('example', extractor)
    extractor.bind(field)
    resource_info = ResourceInfo()

    extracted_value = extractor.extract_value(resource_info)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(u'foo', extracted_value)
    self.assertIsInstance(extracted_value, unicode)
def test_skips_field_if_no_value_extracted_and_field_not_required(self):
    # The converter's extract_metadata is stubbed to return an empty dict,
    # so nothing can be extracted for the field. Because the field is not
    # declared required, the result is expected to omit it entirely.
    converter = MagicMock()
    converter.extract_metadata = MagicMock(return_value={})
    field = Field(
        'optional_field',
        extractor=ExampleMetadataExtractor(),
        type_=unicode)
    engine = self._create_engine(fields=[field], converter=converter)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual({}, engine.extract_field_values())
def test_returns_unicode_for_multivalued_string_constant(self):
    # A multivalued ConstantExtractor built from plain strings must return
    # equal values, each of which is a unicode instance.
    extractor = ConstantExtractor(['foo', 'bar'])
    field = Field('example', extractor, multivalued=True)
    extractor.bind(field)
    resource_info = ResourceInfo()

    extracted_value = extractor.extract_value(resource_info)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual([u'foo', u'bar'], extracted_value)
    for item in extracted_value:
        self.assertIsInstance(item, unicode)
def test_provides_default_for_required_datetime_fields(self):
    # The converter's extract_metadata is stubbed to return an empty dict,
    # so nothing can be extracted. Because the datetime field is required,
    # the engine is expected to fill in datetime(1970, 1, 1, 0, 0).
    converter = MagicMock()
    converter.extract_metadata = MagicMock(return_value={})
    field = Field(
        'required_datetime',
        extractor=ExampleMetadataExtractor(),
        type_=datetime,
        required=True)
    engine = self._create_engine(fields=[field], converter=converter)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        {'required_datetime': datetime(1970, 1, 1, 0, 0)},
        engine.extract_field_values())
def setUp(self):
    CrawlerTestCase.setUp(self)

    # TODO: Refactor this testcase
    example_site = Site('http://example.org')
    self.resource_info = ResourceInfo()
    self.mapping = {'travel': 'TRAVEL', 'music': 'MUSIC'}

    # 'subcategory' always yields 'travel'; 'category' is built from a
    # FieldMappingExtractor over 'subcategory' using self.mapping.
    subcategory_field = Field(
        'subcategory', extractor=ConstantExtractor('travel'))
    mapping_extractor = FieldMappingExtractor('subcategory', self.mapping)
    category_field = Field('category', extractor=mapping_extractor)

    # Only sites and fields carry real values in this fixture.
    self.config = Config(
        sites=[example_site],
        tika=None,
        solr=None,
        unique_field=None,
        url_field=None,
        last_modified_field=None,
        fields=[category_field, subcategory_field],
    )
def setUp(self):
    CrawlerTestCase.setUp(self)

    # Fixture values, listed in the positional order Config receives them.
    self.site = Site('http://example.org')
    self.unique_field = 'UID'
    self.url_field = 'url'
    self.last_modified_field = 'modified'
    self.field = Field('foo', extractor=Extractor())
    self.tika = 'http://localhost:9998'
    self.solr = 'http://localhost:8983/solr'
    self.slacktoken = 'token'
    self.slackchannel = '#channel'

    self.config = Config(
        [self.site],
        self.unique_field,
        self.url_field,
        self.last_modified_field,
        [self.field],
        self.tika,
        self.solr,
        self.slacktoken,
        self.slackchannel)
sites=[ Site('https://www.sportamt-bern.ch/', attributes={'site_area': 'Sportamt Bern'}), Site('http://www.sitemapxml.co.uk/', attributes={'site_area': 'Sitemap XML'}), Site('http://www.pctipp.ch/', attributes={'site_area': 'PCtipp'}), Site('http://mailchimp.com', attributes={'site_area': 'MailChimp'}), Site('https://bgs.zg.ch', attributes={'site_area': 'Gesetzessammlung'}), ], unique_field='UID', url_field='path_string', last_modified_field='modified', fields=[ Field('allowedRolesAndUsers', extractor=ConstantExtractor(['Anonymous']), multivalued=True), Field('created', extractor=LastModifiedExtractor(), type_=datetime), Field('Creator', extractor=CreatorExtractor()), Field('Description', extractor=DescriptionExtractor()), Field('effective', extractor=IndexingTimeExtractor(), type_=datetime), Field('expires', extractor=ConstantExtractor(datetime(2050, 12, 31)), type_=datetime), Field('getId', extractor=SlugExtractor()), Field('getRemoteUrl', extractor=TargetURLExtractor()), Field('modified', extractor=LastModifiedExtractor(), type_=datetime), Field('object_type', extractor=FieldMappingExtractor('portal_type', OBJECT_TYPE_MAPPING, default='File')),
def test_field_stores_required(self):
    # A Field constructed with required=True must expose that flag via
    # its `required` attribute.
    field = Field(self.name, self.extractor, self.type_, required=True)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(True, field.required)
def test_field_binds_extractor_to_self(self):
    # Constructing a Field with an extractor is expected to leave the
    # extractor's `field` attribute pointing back at that Field.
    extractor = Extractor()
    field = Field(self.name, extractor, self.type_, required=True)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(field, extractor.field)
def test_field_stores_name(self):
    # The name passed to the Field constructor must be readable back from
    # its `name` attribute.
    field = Field(self.name, self.extractor)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(self.name, field.name)
def test_applies_metadata_extractors_to_converter_metadata(self):
    # The mocked converter is given two metadata entries; only the value
    # u'value' (stored under 'example') is expected to show up, under the
    # field's own name 'EXAMPLE'.
    converter = MockConverter({'example': u'value', 'other': u'data'})
    field = Field('EXAMPLE', extractor=ExampleMetadataExtractor())
    engine = self._create_engine(fields=[field], converter=converter)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        {'EXAMPLE': u'value'},
        engine.extract_field_values())
def test_field_stores_type(self):
    # The type passed to the Field constructor must be readable back from
    # its `type_` attribute.
    field = Field(self.name, self.extractor, self.type_)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(self.type_, field.type_)
def test_raises_type_error_for_unknown_extractor_type(self):
    # A field backed by a bare Extractor instance must make extraction
    # fail.
    # NOTE(review): despite the "type_error" in the test name, the
    # exception asserted here is ExtractionError.
    bare_extractor = Extractor()
    unsupported_field = Field('foo', extractor=bare_extractor)
    engine = self._create_engine(fields=[unsupported_field])

    self.assertRaises(ExtractionError, engine.extract_field_values)