def test_no_locations(self):
    '''
    Test that the extractor works with an empty list of locations.
    '''
    extractor = geoextract.NameExtractor()
    pipeline = geoextract.Pipeline([], extractors=[extractor])
    assert pipeline.extract('foobar') == []
def test_app_creation(self):
    '''
    Test creating a web app from a pipeline.
    '''
    pipeline = geoextract.Pipeline([])
    app = pipeline.create_app()
    assert hasattr(app, 'run')
def test_no_validation(self):
    '''
    Test disabled validation.
    '''
    extractor = FakeExtractor([(0, 1, {'name': 'a'})])
    pipeline = geoextract.Pipeline([], extractors=[extractor],
                                   validator=False)
    assert pipeline.extract('does not matter') == [{'name': 'a'}]
def test_extractors(self):
    '''
    Test that extractors are called correctly.
    '''
    extractor1 = FakeExtractor([(0, 1, {'name': 'foo'})])
    extractor2 = FakeExtractor([(1, 1, {'name': 'bar'})])
    pipeline = geoextract.Pipeline([], extractors=[extractor1, extractor2])
    results = pipeline.extract('does not matter')
    assert sorted(r['name'] for r in results) == ['bar', 'foo']
def test_locations(self):
    '''
    Test that locations are correctly converted to a dict.
    '''
    pipeline = geoextract.Pipeline([location1, location2])
    locations = pipeline.locations
    assert locations['foo'] is location1
    assert locations['bar'] is location2
    assert len(locations) == 2
def test_no_splitting(self):
    '''
    Test disabled splitting.
    '''
    extractor = mock.Mock()
    extractor.extract = mock.Mock()
    extractor.extract.return_value = []
    pipeline = geoextract.Pipeline([], extractors=[extractor],
                                   splitter=False)
    pipeline.extract('white space')
    extractor.extract.assert_called_once_with('white space')
def test_normalized_extractor_input(self):
    '''
    Test that extractor input is normalized.
    '''
    extractor = mock.Mock()
    extractor.extract = mock.Mock()
    extractor.extract.return_value = []
    pipeline = geoextract.Pipeline([], extractors=[extractor],
                                   normalizer=UpperNormalizer())
    pipeline.extract('foo')
    extractor.extract.assert_called_once_with('FOO')
def test_normalized_names(self):
    '''
    Test that location names are correctly normalized.
    '''
    pipeline = geoextract.Pipeline([location1, location2],
                                   normalizer=UpperNormalizer())
    names = pipeline.normalized_names
    assert names['FOO'] is location1
    assert names['BAR'] is location2
    assert names['BAZINGA'] is location2
    assert len(names) == 3
def test_no_normalizer(self):
    '''
    Test disabled normalization.
    '''
    extractor = mock.Mock()
    extractor.extract = mock.Mock()
    extractor.extract.return_value = [(0, 1, {'name': 'A B'})]
    pipeline = geoextract.Pipeline([{'name': 'A B'}], extractors=[extractor],
                                   normalizer=False)
    results = pipeline.extract('NO_NORMALIZATION--')
    extractor.extract.assert_called_once_with('NO_NORMALIZATION--')
    assert results == [{'name': 'A B'}]
def test_splitting(self):
    '''
    Test splitting of documents.
    '''
    class MockSplitter(geoextract.Splitter):
        def split(self, s):
            # Returning the string itself makes the pipeline iterate
            # over it, so each character becomes its own chunk.
            return s

    extractor = mock.Mock()
    extractor.extract = mock.Mock()
    extractor.extract.return_value = []
    pipeline = geoextract.Pipeline([], extractors=[extractor],
                                   splitter=MockSplitter())
    pipeline.extract('foo')
    extractor.extract.assert_has_calls(
        [mock.call('f'), mock.call('o'), mock.call('o')]
    )
def test_validation(self):
    '''
    Test validation of results.
    '''
    class MockValidator(geoextract.Validator):
        def validate(self, location):
            return location['name'] == 'a'

    extractor = FakeExtractor([
        (0, 1, {'name': 'a'}),
        (1, 1, {'name': 'b'}),
    ])
    pipeline = geoextract.Pipeline([], extractors=[extractor],
                                   validator=MockValidator())
    results = pipeline.extract('does not matter')
    assert len(results) == 1
    assert results[0]['name'] == 'a'
def test_name_denormalization(self):
    '''
    Test that names in results are denormalized.
    '''
    locations = [
        {'name': 'a-street'},
        {'name': 'a-city'},
        {'name': 'a-name'},
    ]
    normalizer = UpperNormalizer()
    result = (0, 0, {'name': 'A-NAME', 'street': 'A-STREET',
                     'city': 'A-CITY'})
    pipeline = geoextract.Pipeline(locations, normalizer=normalizer,
                                   extractors=[FakeExtractor([result])])
    extracted = pipeline.extract('does not matter')
    assert extracted[0]['name'] == 'a-name'
    assert extracted[0]['street'] == 'a-street'
    assert extracted[0]['city'] == 'a-city'
def test_postprocessing(self):
    '''
    Test postprocessing of results.
    '''
    class MockPostprocessor(geoextract.Postprocessor):
        def postprocess(self, location):
            if location['name'] == 'a':
                location['foo'] = 'bar'
                return location
            else:
                return False

    extractor = FakeExtractor([
        (0, 1, {'name': 'a'}),
        (1, 1, {'name': 'b'}),
    ])
    pipeline = geoextract.Pipeline([], extractors=[extractor],
                                   postprocessors=[MockPostprocessor()])
    results = pipeline.extract('does not matter')
    assert len(results) == 1
    assert results[0] == {'name': 'a', 'foo': 'bar'}
def test_component_setup(self):
    '''
    Test that components are set up correctly.
    '''
    normalizer = mock.Mock()
    extractor1 = mock.Mock()
    extractor2 = mock.Mock()
    validator = mock.Mock()
    splitter = mock.Mock()
    postprocessor1 = mock.Mock()
    postprocessor2 = mock.Mock()
    geoextract.Pipeline([], extractors=[extractor1, extractor2],
                        validator=validator, normalizer=normalizer,
                        splitter=splitter,
                        postprocessors=[postprocessor1, postprocessor2])
    assert normalizer.setup.called
    assert extractor1.setup.called
    assert extractor2.setup.called
    assert validator.setup.called
    assert splitter.setup.called
    assert postprocessor1.setup.called
    assert postprocessor2.setup.called
def test_duplicate_removal(self):
    '''
    Test removal of duplicate results.
    '''
    keys = ['street', 'house_number', 'postcode', 'city']
    for subkeys in subsets(keys):
        subkeys.append('name')
        loc1 = {subkey: subkey for subkey in subkeys}
        loc2 = loc1.copy()  # Equal to loc1
        loc3 = loc1.copy()
        loc3['foo'] = 'bar'  # Equal to loc1 because other keys are ignored
        loc4 = loc1.copy()
        loc4[subkeys[0]] = 'x'  # Not equal
        extractor = FakeExtractor([
            (0, 1, loc1),
            (1, 1, loc2),
            (2, 1, loc3),
            (3, 1, loc4),
        ])
        pipeline = geoextract.Pipeline([], extractors=[extractor])
        results = pipeline.extract('does not matter')
        assert sort_as_json(results) == sort_as_json([loc1, loc4])
def test_pruning_of_overlapping_results(self):
    '''
    Test that overlapping results are pruned.
    '''
    # Layout of the fake results below (one column per character):
    # results completely contained in a longer one (b, c, d inside e)
    # are pruned, while partial overlaps (f and g) are both kept.
    #
    # a
    #  bb
    #   c
    #   ddd
    #  eeee
    #      ff
    #     gg
    extractor = FakeExtractor([
        (0, 1, {'name': 'a'}),
        (1, 2, {'name': 'b'}),
        (2, 1, {'name': 'c'}),
        (2, 3, {'name': 'd'}),
        (1, 4, {'name': 'e'}),
        (5, 2, {'name': 'f'}),
        (4, 2, {'name': 'g'}),
    ])
    pipeline = geoextract.Pipeline([], extractors=[extractor])
    results = pipeline.extract('does not matter')
    assert sorted(r['name'] for r in results) == ['a', 'e', 'f', 'g']
key_filter_postprocessor = geoextract.KeyFilterPostprocessor(KEYS_TO_KEEP)


#
# PIPELINE CONSTRUCTION
#

# A pipeline connects all the different components.
#
# Here we're using custom extractors and a custom normalizer. We could also
# provide our own code for splitting a document into chunks and for
# validation, but for simplicity we'll use the default implementations in
# these cases.

pipeline = geoextract.Pipeline(
    locations,
    extractors=[pattern_extractor, name_extractor],
    normalizer=normalizer,
    postprocessors=[key_filter_postprocessor],
)


#
# COMMAND LINE INTERFACE
#

# This example can be used to either extract locations from a file specified
# on the command line or (if no additional argument is given) to start a web
# server which provides location extraction as a web service.

if __name__ == '__main__':
    import io
    import json
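    import sys

    # What follows is a hedged sketch of the behaviour described in the
    # comments above, not necessarily the original script verbatim: given a
    # file name argument the extracted locations are dumped as JSON,
    # otherwise the pipeline is served as a web app (create_app() and its
    # run() method are exercised by test_app_creation above).
    if len(sys.argv) > 1:
        with io.open(sys.argv[1], encoding='utf-8') as f:
            text = f.read()
        print(json.dumps(pipeline.extract(text), indent=2, sort_keys=True))
    else:
        pipeline.create_app().run()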
def extract_found_locations(text, bodies=None):
    """
    :type text: str
    :type bodies: list of Body
    :return: list
    """
    search_for = create_geoextract_data(bodies)

    #
    # STRING NORMALIZATION
    #
    # Strings must be normalized before searching and matching them. This
    # includes technical normalization (e.g. Unicode normalization),
    # linguistic normalization (e.g. stemming) and content normalization
    # (e.g. synonym handling).

    normalizer = geoextract.BasicNormalizer(subs=[(r'str\b', 'strasse')],
                                            stem='german')

    #
    # NAMES
    #
    # Many places can be referred to using just their name, for example
    # specific buildings (e.g. the Brandenburger Tor), streets (Hauptstraße)
    # or other points of interest. These can be extracted using the
    # ``NameExtractor``.
    #
    # Note that the extractor will automatically receive the (normalized)
    # location names from the pipeline we construct later, so there's no
    # need to explicitly pass them to the constructor.

    name_extractor = geoextract.NameExtractor()

    #
    # PATTERNS
    #
    # For locations that are notated using a semi-structured format
    # (addresses) the ``PatternExtractor`` is a good choice. It looks for
    # matches of regular expressions.
    #
    # The patterns should have named groups; their sub-matches will be
    # returned in the extracted locations.

    address_pattern = re.compile(r'''
        (?P<street>[^\W\d_](?:[^\W\d_]|\s)*[^\W\d_])
        \s+
        (?P<house_number>([1-9]\d*)[\w-]*)
        (
            \s+
            (
                (?P<postcode>\d{5})
                \s+
            )?
            (?P<city>([^\W\d_]|-)+)
        )?
    ''', flags=re.UNICODE | re.VERBOSE)

    pattern_extractor = geoextract.PatternExtractor([address_pattern])

    #
    # POSTPROCESSING
    #
    # Once locations are extracted you might want to postprocess them, for
    # example to remove certain attributes that are useful for validation
    # but are not intended for publication. Or you may want to remove a
    # certain address that's printed in the footer of all the documents
    # you're processing.
    #
    # GeoExtract allows you to do this by using one or more postprocessors.
    # In this example we will remove all but a few keys from our location
    # dicts.

    keys_to_keep = ['name', 'street', 'house_number', 'postcode', 'city']
    key_filter_postprocessor = geoextract.KeyFilterPostprocessor(keys_to_keep)

    #
    # PIPELINE CONSTRUCTION
    #
    # A pipeline connects all the different components.
    #
    # Here we're using custom extractors and a custom normalizer. We could
    # also provide our own code for splitting a document into chunks and for
    # validation, but for simplicity we'll use the default implementations
    # in these cases.

    pipeline = geoextract.Pipeline(
        search_for,
        extractors=[pattern_extractor, name_extractor],
        normalizer=normalizer,
        postprocessors=[key_filter_postprocessor],
    )

    return pipeline.extract(text)
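
# A minimal, hypothetical demo of the function above. The sample sentence is
# made up, and calling it without ``bodies`` assumes create_geoextract_data()
# handles ``None``; adapt as needed. Each result is a dict with at most the
# keys 'name', 'street', 'house_number', 'postcode' and 'city'.
if __name__ == '__main__':
    sample = 'Treffpunkt ist die Hauptstraße 7, 76133 Karlsruhe.'
    for location in extract_found_locations(sample):
        print(location)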