Beispiel #1
0
    def test_applies_text_extractors_to_converter_plain_text(self):
        # The converter is created with the text u'foo bar'; a field using
        # ExampleTextExtractor is expected to report exactly that text.
        converter = MockConverter(text=u'foo bar')
        field = Field('EXAMPLE', extractor=ExampleTextExtractor())
        engine = self._create_engine(fields=[field], converter=converter)

        self.assertEqual({'EXAMPLE': u'foo bar'},
                         engine.extract_field_values())
Beispiel #2
0
    def test_returns_constant_value(self):
        # A ConstantExtractor built with 42 and bound to a field is expected
        # to return 42 for a default-constructed ResourceInfo.
        extractor = ConstantExtractor(42)
        field = Field('example', extractor)
        extractor.bind(field)
        resource_info = ResourceInfo()

        self.assertEqual(42, extractor.extract_value(resource_info))
Beispiel #3
0
    def test_asserts_proper_type_for_multivalued_extractors(self):
        # A multivalued int field whose extractor yields the list [42] is
        # expected to pass through unchanged as [42].
        field = Field('int_field',
                      extractor=ConstantExtractor([42]),
                      type_=int,
                      multivalued=True)
        engine = self._create_engine(fields=[field])

        self.assertEqual({'int_field': [42]}, engine.extract_field_values())
Beispiel #4
0
    def test_applies_urlinfo_extractors_to_urlinfo(self):
        # The ResourceInfo carries url_info with a 'loc' entry; a field using
        # ExampleURLInfoExtractor is expected to equal that URL.
        field = Field('EXAMPLE', extractor=ExampleURLInfoExtractor())
        resource_info = ResourceInfo(url_info={'loc': 'http://example.org'})
        engine = self._create_engine(resource_info=resource_info,
                                     fields=[field])

        self.assertEqual({'EXAMPLE': u'http://example.org'},
                         engine.extract_field_values())
Beispiel #5
0
    def test_asserts_proper_type_for_extractors(self):
        # The extractor yields the constant 'foo' while the field declares
        # type_=int; extracting values is expected to raise ExtractionError.
        string_extractor = ConstantExtractor('foo')
        int_field = Field('int_field', extractor=string_extractor, type_=int)
        engine_under_test = self._create_engine(fields=[int_field])

        with self.assertRaises(ExtractionError):
            engine_under_test.extract_field_values()
Beispiel #6
0
    def test_applies_http_header_extractors_to_headers(self):
        # The ResourceInfo is given a single header 'example-header'; the
        # extractor configured for that header name is expected to yield its
        # value.
        field = Field('EXAMPLE',
                      extractor=ExampleHTTPHeaderExtractor('example-header'))
        resource_info = ResourceInfo(headers={'example-header': 'value'})
        engine = self._create_engine(fields=[field],
                                     resource_info=resource_info)

        self.assertEqual({'EXAMPLE': u'value'}, engine.extract_field_values())
Beispiel #7
0
    def test_applies_site_config_extractors_to_site(self):
        # The site is created with the attribute 'name' set to 'My Site'; a
        # field using SiteAttributeExtractor('name') is expected to yield it.
        field = Field('EXAMPLE', extractor=SiteAttributeExtractor('name'))
        site = Site('http://example.org', attributes={'name': 'My Site'})
        resource_info = ResourceInfo(site=site)
        engine = self._create_engine(fields=[field],
                                     resource_info=resource_info)

        self.assertEqual({'EXAMPLE': u'My Site'},
                         engine.extract_field_values())
Beispiel #8
0
    def test_returns_unicode_for_string_constant(self):
        # A ConstantExtractor built with the str 'foo' is expected to return
        # a value equal to u'foo' that is an instance of unicode.
        extractor = ConstantExtractor('foo')
        field = Field('example', extractor)
        extractor.bind(field)
        resource_info = ResourceInfo()
        extracted_value = extractor.extract_value(resource_info)

        self.assertEqual(u'foo', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
Beispiel #9
0
    def test_skips_field_if_no_value_extracted_and_field_not_required(self):
        # The mocked converter returns empty metadata; the field is not
        # marked required, so the engine is expected to return no values.
        converter = MagicMock()
        converter.extract_metadata = MagicMock(return_value={})

        field = Field('optional_field',
                      extractor=ExampleMetadataExtractor(),
                      type_=unicode)
        engine = self._create_engine(fields=[field], converter=converter)

        self.assertEqual({}, engine.extract_field_values())
Beispiel #10
0
    def test_returns_unicode_for_multivalued_string_constant(self):
        # A multivalued ConstantExtractor built from str items is expected to
        # return a list equal to [u'foo', u'bar'] whose items are all
        # instances of unicode.
        extractor = ConstantExtractor(['foo', 'bar'])
        field = Field('example', extractor, multivalued=True)
        extractor.bind(field)
        resource_info = ResourceInfo()
        extracted_value = extractor.extract_value(resource_info)

        self.assertEqual([u'foo', u'bar'], extracted_value)
        for item in extracted_value:
            self.assertIsInstance(item, unicode)
Beispiel #11
0
    def test_provides_default_for_required_datetime_fields(self):
        # The mocked converter returns empty metadata; for a required
        # datetime field the engine is expected to report
        # datetime(1970, 1, 1, 0, 0).
        converter = MagicMock()
        converter.extract_metadata = MagicMock(return_value={})

        field = Field('required_datetime',
                      extractor=ExampleMetadataExtractor(),
                      type_=datetime,
                      required=True)
        engine = self._create_engine(fields=[field], converter=converter)

        self.assertEqual({'required_datetime': datetime(1970, 1, 1, 0, 0)},
                         engine.extract_field_values())
Beispiel #12
0
    def setUp(self):
        # Prepares self.resource_info, self.mapping and self.config. The
        # config holds two fields: 'subcategory' (constant 'travel') and
        # 'category', whose FieldMappingExtractor is given the field name
        # 'subcategory' together with self.mapping.
        CrawlerTestCase.setUp(self)
        # TODO: Refactor this testcase
        example_site = Site('http://example.org')
        self.resource_info = ResourceInfo()
        self.mapping = {'travel': 'TRAVEL', 'music': 'MUSIC'}

        subcategory_field = Field(
            'subcategory', extractor=ConstantExtractor('travel'))

        mapping_extractor = FieldMappingExtractor('subcategory', self.mapping)
        category_field = Field('category', extractor=mapping_extractor)

        # 'category' is listed before 'subcategory', as in the original.
        config_fields = [category_field, subcategory_field]
        self.config = Config(
            sites=[example_site],
            tika=None,
            solr=None,
            unique_field=None,
            url_field=None,
            last_modified_field=None,
            fields=config_fields,
        )
    def setUp(self):
        # Stores every Config constructor argument on the test case, then
        # builds self.config from them positionally.
        CrawlerTestCase.setUp(self)

        # Attributes are listed in the order Config receives them below.
        self.site = Site('http://example.org')
        self.unique_field = 'UID'
        self.url_field = 'url'
        self.last_modified_field = 'modified'
        self.field = Field('foo', extractor=Extractor())
        self.tika = 'http://localhost:9998'
        self.solr = 'http://localhost:8983/solr'
        self.slacktoken = 'token'
        self.slackchannel = '#channel'

        config_args = (
            [self.site],
            self.unique_field,
            self.url_field,
            self.last_modified_field,
            [self.field],
            self.tika,
            self.solr,
            self.slacktoken,
            self.slackchannel,
        )
        self.config = Config(*config_args)
Beispiel #14
0
 sites=[
     Site('https://www.sportamt-bern.ch/',
          attributes={'site_area': 'Sportamt Bern'}),
     Site('http://www.sitemapxml.co.uk/',
          attributes={'site_area': 'Sitemap XML'}),
     Site('http://www.pctipp.ch/', attributes={'site_area': 'PCtipp'}),
     Site('http://mailchimp.com', attributes={'site_area': 'MailChimp'}),
     Site('https://bgs.zg.ch', attributes={'site_area':
                                           'Gesetzessammlung'}),
 ],
 unique_field='UID',
 url_field='path_string',
 last_modified_field='modified',
 fields=[
     Field('allowedRolesAndUsers',
           extractor=ConstantExtractor(['Anonymous']),
           multivalued=True),
     Field('created', extractor=LastModifiedExtractor(), type_=datetime),
     Field('Creator', extractor=CreatorExtractor()),
     Field('Description', extractor=DescriptionExtractor()),
     Field('effective', extractor=IndexingTimeExtractor(), type_=datetime),
     Field('expires',
           extractor=ConstantExtractor(datetime(2050, 12, 31)),
           type_=datetime),
     Field('getId', extractor=SlugExtractor()),
     Field('getRemoteUrl', extractor=TargetURLExtractor()),
     Field('modified', extractor=LastModifiedExtractor(), type_=datetime),
     Field('object_type',
           extractor=FieldMappingExtractor('portal_type',
                                           OBJECT_TYPE_MAPPING,
                                           default='File')),
 def test_field_stores_required(self):
     # A Field created with required=True is expected to expose a
     # `required` attribute equal to True.
     field = Field(self.name, self.extractor, self.type_, required=True)
     self.assertEqual(True, field.required)
 def test_field_binds_extractor_to_self(self):
     # After constructing a Field with an extractor, the extractor's
     # `field` attribute is expected to equal that Field.
     extractor = Extractor()
     field = Field(self.name, extractor, self.type_, required=True)
     self.assertEqual(field, extractor.field)
 def test_field_stores_name(self):
     # The name passed to Field is expected to be exposed as `name`.
     field = Field(self.name, self.extractor)
     self.assertEqual(self.name, field.name)
Beispiel #18
0
    def test_applies_metadata_extractors_to_converter_metadata(self):
        # The converter is created with two metadata entries; the single
        # field using ExampleMetadataExtractor is expected to yield u'value'
        # and nothing else appears in the result.
        converter = MockConverter({'example': u'value', 'other': u'data'})
        field = Field('EXAMPLE', extractor=ExampleMetadataExtractor())
        engine = self._create_engine(fields=[field], converter=converter)

        self.assertEqual({'EXAMPLE': u'value'}, engine.extract_field_values())
 def test_field_stores_type(self):
     # The type passed to Field is expected to be exposed as `type_`.
     field = Field(self.name, self.extractor, self.type_)
     self.assertEqual(self.type_, field.type_)
Beispiel #20
0
    def test_raises_type_error_for_unknown_extractor_type(self):
        # A field backed by a plain Extractor instance is expected to make
        # value extraction fail with ExtractionError.
        # NOTE(review): the test name mentions a "type error", but the
        # assertion expects ExtractionError -- confirm which is intended.
        plain_extractor = Extractor()
        unsupported_field = Field('foo', extractor=plain_extractor)
        engine_under_test = self._create_engine(fields=[unsupported_field])

        with self.assertRaises(ExtractionError):
            engine_under_test.extract_field_values()