Example #1
 def test_no_locations(self):
     '''
     Test that the extractor works with an empty list of locations.
     '''
     extractor = geoextract.NameExtractor()
     pipeline = geoextract.Pipeline([], extractors=[extractor])
     assert pipeline.extract('foobar') == []
Example #2
 def test_app_creation(self):
     '''
     Test creating a web app from a pipeline.
     '''
     pipeline = geoextract.Pipeline([])
     app = pipeline.create_app()
     assert hasattr(app, 'run')
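
Judging by the ``run`` attribute, ``create_app()`` returns a Flask-style app
object. A minimal usage sketch (how the app serves requests is an assumption
based on this test, not taken from the geoextract docs):

    pipeline = geoextract.Pipeline([])
    app = pipeline.create_app()
    app.run()  # assumed: serves location extraction as a web service
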
Example #3
 def test_no_validation(self):
     '''
     Test disabled validation.
     '''
     extractor = FakeExtractor([(0, 1, {'name': 'a'})])
     pipeline = geoextract.Pipeline([], extractors=[extractor],
                                    validator=False)
     assert pipeline.extract('does not matter') == [{'name': 'a'}]
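
``FakeExtractor``, used here and in several examples below, is a test helper
that these excerpts don't include. Inferred from its usage, a minimal sketch
could look like the following; the ``geoextract.Extractor`` base class and
the exact semantics are assumptions:

    class FakeExtractor(geoextract.Extractor):
        # Ignores the input and always reports the same fixed list of
        # (start, length, location) results.
        def __init__(self, results):
            self.results = results

        def extract(self, text):
            return self.results
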
Example #4
 def test_extractors(self):
     '''
     Test that extractors are called correctly.
     '''
     extractor1 = FakeExtractor([(0, 1, {'name': 'foo'})])
     extractor2 = FakeExtractor([(1, 1, {'name': 'bar'})])
     pipeline = geoextract.Pipeline([], extractors=[extractor1, extractor2])
     results = pipeline.extract('does not matter')
     assert sorted(r['name'] for r in results) == ['bar', 'foo']
Example #5
 def test_locations(self):
     '''
     Test that locations are correctly converted to a dict.
     '''
     pipeline = geoextract.Pipeline([location1, location2])
     locations = pipeline.locations
     assert locations['foo'] is location1
     assert locations['bar'] is location2
     assert len(locations) == 2
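
``location1`` and ``location2`` are fixtures defined outside these excerpts.
The assertions here only pin down their names, so a plausible shape is shown
below; Example #8 additionally implies that ``location2`` carries a second
name, 'bazinga', via some alias mechanism, the details of which are an
assumption:

    location1 = {'name': 'foo'}
    location2 = {'name': 'bar'}  # assumed: also aliased as 'bazinga'
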
Example #6
 def test_no_splitting(self):
     '''
     Test disabled splitting.
     '''
     extractor = mock.Mock()
     extractor.extract = mock.Mock()
     extractor.extract.return_value = []
     pipeline = geoextract.Pipeline([], extractors=[extractor],
                                    splitter=False)
     pipeline.extract('white   space')
     extractor.extract.assert_called_once_with('white space')
Example #7
 def test_normalized_extractor_input(self):
     '''
     Test that extractor input is normalized.
     '''
     extractor = mock.Mock()
     extractor.extract = mock.Mock()
     extractor.extract.return_value = []
     pipeline = geoextract.Pipeline([], extractors=[extractor],
                                    normalizer=UpperNormalizer())
     pipeline.extract('foo')
     extractor.extract.assert_called_once_with('FOO')
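
``UpperNormalizer`` is another helper omitted from these excerpts. The 'FOO'
assertion above suggests a sketch like this (the ``geoextract.Normalizer``
base class is assumed, mirroring the ``geoextract.Splitter`` and
``geoextract.Validator`` subclasses used in other examples):

    class UpperNormalizer(geoextract.Normalizer):
        # Normalization here simply uppercases the input.
        def normalize(self, text):
            return text.upper()
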
Example #8
 def test_normalized_names(self):
     '''
     Test that location names are correctly normalized.
     '''
     pipeline = geoextract.Pipeline([location1, location2],
                                    normalizer=UpperNormalizer())
     names = pipeline.normalized_names
     assert names['FOO'] is location1
     assert names['BAR'] is location2
     assert names['BAZINGA'] is location2
     assert len(names) == 3
Example #9
 def test_no_normalizer(self):
     '''
     Test disabled normalization.
     '''
     extractor = mock.Mock()
     extractor.extract = mock.Mock()
     extractor.extract.return_value = [(0, 1, {'name': 'A  B'})]
     pipeline = geoextract.Pipeline([{'name': 'A  B'}],
                                    extractors=[extractor],
                                    normalizer=False)
     results = pipeline.extract('NO_NORMALIZATION--')
     extractor.extract.assert_called_once_with('NO_NORMALIZATION--')
     assert results == [{'name': 'A  B'}]
Example #10
    def test_splitting(self):
        '''
        Test splitting of documents.
        '''
        class MockSplitter(geoextract.Splitter):
            def split(self, s):
                return s

        extractor = mock.Mock()
        extractor.extract = mock.Mock()
        extractor.extract.return_value = []
        pipeline = geoextract.Pipeline([], extractors=[extractor],
                                       splitter=MockSplitter())
        pipeline.extract('foo')
        extractor.extract.assert_has_calls(
            [mock.call('f'), mock.call('o'), mock.call('o')]
        )
Example #11
    def test_validation(self):
        '''
        Test validation of results.
        '''
        class MockValidator(geoextract.Validator):
            def validate(self, location):
                return location['name'] == 'a'

        extractor = FakeExtractor([
            (0, 1, {'name': 'a'}),
            (1, 1, {'name': 'b'}),
        ])
        pipeline = geoextract.Pipeline([], extractors=[extractor],
                                       validator=MockValidator())
        results = pipeline.extract('does not matter')
        assert len(results) == 1
        assert results[0]['name'] == 'a'
Example #12
 def test_name_denormalization(self):
     '''
     Test that names in results are denormalized.
     '''
     locations = [
         {'name': 'a-street'},
         {'name': 'a-city'},
         {'name': 'a-name'},
     ]
     normalizer = UpperNormalizer()
     result = (0, 0, {'name': 'A-NAME', 'street': 'A-STREET',
                      'city': 'A-CITY'})
     pipeline = geoextract.Pipeline(locations, normalizer=normalizer,
                                    extractors=[FakeExtractor([result])])
     extracted = pipeline.extract('does not matter')
     assert extracted[0]['name'] == 'a-name'
     assert extracted[0]['street'] == 'a-street'
     assert extracted[0]['city'] == 'a-city'
Example #13
    def test_postprocessing(self):
        '''
        Test postprocessing of results.
        '''
        class MockPostprocessor(geoextract.Postprocessor):
            def postprocess(self, location):
                if location['name'] == 'a':
                    location['foo'] = 'bar'
                    return location
                else:
                    return False

        extractor = FakeExtractor([
            (0, 1, {'name': 'a'}),
            (1, 1, {'name': 'b'}),
        ])
        pipeline = geoextract.Pipeline([], extractors=[extractor],
                                       postprocessors=[MockPostprocessor()])
        results = pipeline.extract('does not matter')
        assert len(results) == 1
        assert results[0] == {'name': 'a', 'foo': 'bar'}
Example #14
 def test_component_setup(self):
     '''
     Test that components are set up correctly.
     '''
     normalizer = mock.Mock()
     extractor1 = mock.Mock()
     extractor2 = mock.Mock()
     validator = mock.Mock()
     splitter = mock.Mock()
     postprocessor1 = mock.Mock()
     postprocessor2 = mock.Mock()
     geoextract.Pipeline([], extractors=[extractor1, extractor2],
                         validator=validator, normalizer=normalizer,
                         splitter=splitter,
                         postprocessors=[postprocessor1, postprocessor2])
     assert normalizer.setup.called
     assert extractor1.setup.called
     assert extractor2.setup.called
     assert validator.setup.called
     assert splitter.setup.called
     assert postprocessor1.setup.called
     assert postprocessor2.setup.called
Example #15
 def test_duplicate_removal(self):
     '''
     Test removal of duplicate results.
     '''
     keys = ['street', 'house_number', 'postcode', 'city']
     for subkeys in subsets(keys):
         subkeys.append('name')
         loc1 = {subkey: subkey for subkey in subkeys}
         loc2 = loc1.copy()  # Equal to loc1
         loc3 = loc1.copy()
         loc3['foo'] = 'bar'  # Equal to loc1 because other keys are ignored
         loc4 = loc1.copy()
         loc4[subkeys[0]] = 'x'  # Not equal
         extractor = FakeExtractor([
             (0, 1, loc1),
             (1, 1, loc2),
             (2, 1, loc3),
             (3, 1, loc4),
         ])
         pipeline = geoextract.Pipeline([], extractors=[extractor])
         results = pipeline.extract('does not matter')
         assert sort_as_json(results) == sort_as_json([loc1, loc4])
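
``subsets`` and ``sort_as_json`` are small helpers not shown in these
excerpts. Judging from how the test uses them, plausible implementations
(assumed, for readability) are:

    from itertools import combinations
    import json

    def subsets(keys):
        # Every subset of ``keys``, each as a mutable list (the test
        # appends to and indexes into them).
        return [list(c) for r in range(len(keys) + 1)
                for c in combinations(keys, r)]

    def sort_as_json(locations):
        # Canonical, order-insensitive representation for comparison.
        return sorted(json.dumps(loc, sort_keys=True) for loc in locations)
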
Example #16
 def test_pruning_of_overlapping_results(self):
     '''
     Test that overlapping results are pruned.
     '''
     # a
     #  bb
     #   c
     #   ddd
     #  eeee
     #      ff
     #     gg
     extractor = FakeExtractor([
         (0, 1, {'name': 'a'}),
         (1, 2, {'name': 'b'}),
         (2, 1, {'name': 'c'}),
         (2, 3, {'name': 'd'}),
         (1, 4, {'name': 'e'}),
         (5, 2, {'name': 'f'}),
         (4, 2, {'name': 'g'}),
     ])
     pipeline = geoextract.Pipeline([], extractors=[extractor])
     results = pipeline.extract('does not matter')
     assert sorted(r['name'] for r in results) == ['a', 'e', 'f', 'g']
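
Reading the diagram against the assertion: ``b``, ``c`` and ``d`` are dropped
because each lies entirely inside the longer result ``e``, while ``f`` and
``g`` survive even though they overlap each other (and ``g`` overlaps ``e``).
The pruning exercised here therefore removes only results fully contained in
another result, not merely overlapping ones.
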
Example #17
key_filter_postprocessor = geoextract.KeyFilterPostprocessor(KEYS_TO_KEEP)


#
# PIPELINE CONSTRUCTION
#

# A pipeline connects all the different components.
#
# Here we're using custom extractors and a custom normalizer. We could also
# provide our own code for splitting a document into chunks and for validation,
# but for simplicity we'll use the default implementations in these cases.

pipeline = geoextract.Pipeline(
    locations,
    extractors=[pattern_extractor, name_extractor],
    normalizer=normalizer,
    postprocessors=[key_filter_postprocessor],
)


#
# COMMAND LINE INTERFACE
#

# This example can be used either to extract locations from a file specified
# on the command line or, if no additional argument is given, to start a web
# server that provides location extraction as a web service.

if __name__ == '__main__':
    import io
    import json
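    # The excerpt ends here. Based on the comment above and on the
    # ``extract``/``create_app`` API shown in the other examples, the missing
    # remainder plausibly looks something like this (a sketch, not the
    # original code):
    import sys

    if len(sys.argv) < 2:
        # No file given: serve location extraction as a web service.
        pipeline.create_app().run()
    else:
        # Extract locations from the given file and print them as JSON.
        with io.open(sys.argv[1], 'r', encoding='utf-8') as f:
            text = f.read()
        print(json.dumps(pipeline.extract(text), indent=2, sort_keys=True))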
Example #18
def extract_found_locations(text, bodies=None):
    """
    :type text: str
    :type bodies: list of Body
    :return: list
    """
    search_for = create_geoextract_data(bodies)

    #
    # STRING NORMALIZATION
    #

    # Strings must be normalized before searching and matching them. This includes
    # technical normalization (e.g. Unicode normalization), linguistic
    # normalization (e.g. stemming) and content normalization (e.g. synonym
    # handling).

    normalizer = geoextract.BasicNormalizer(subs=[(r'str\b', 'strasse')],
                                            stem='german')
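
    # Illustration (added; behaviour inferred from the configuration above):
    # the substitution maps e.g. 'Hauptstr' to 'Hauptstrasse' before German
    # stemming, so abbreviated and spelled-out street names normalize alike.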

    #
    # NAMES
    #

    # Many places can be referred to using just their name, for example specific
    # buildings (e.g. the Brandenburger Tor), streets (Hauptstraße) or other
    # points of interest. These can be extracted using the ``NameExtractor``.
    #
    # Note that the extractor will automatically receive the (normalized)
    # location names from the pipeline we construct later, so there's no need
    # to explicitly pass them to the constructor.

    name_extractor = geoextract.NameExtractor()
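
    # (Illustration, added: if the location names passed to the pipeline
    # include e.g. 'hauptstrasse', this extractor reports every occurrence
    # of that normalized name in the text.)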

    #
    # PATTERNS
    #

    # For locations that are notated using a semi-structured format (addresses)
    # the ``PatternExtractor`` is a good choice. It looks for matches of regular
    # expressions.
    #
    # The patterns should have named groups; their sub-matches will be
    # returned in the extracted locations.

    address_pattern = re.compile(r'''
        (?P<street>[^\W\d_](?:[^\W\d_]|\s)*[^\W\d_])
        \s+
        (?P<house_number>([1-9]\d*)[\w-]*)
        (
            \s+
            (
                (?P<postcode>\d{5})
                \s+
            )?
            (?P<city>([^\W\d_]|-)+)
        )?
    ''',
                                 flags=re.UNICODE | re.VERBOSE)

    pattern_extractor = geoextract.PatternExtractor([address_pattern])
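
    # Worked illustration (added, not in the original): against this pattern,
    # 'Hauptstraße 5 69115 Heidelberg' yields street='Hauptstraße',
    # house_number='5', postcode='69115' and city='Heidelberg', while a bare
    # 'Hauptstraße 5' matches with only the first two groups set.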

    #
    # POSTPROCESSING
    #

    # Once locations are extracted you might want to postprocess them, for example
    # to remove certain attributes that are useful for validation but are not
    # intended for publication. Or you may want to remove a certain address that's
    # printed in the footer of all the documents you're processing.
    #
    # GeoExtract allows you to do this by using one or more postprocessors. In this
    # example we will remove all but a few keys from our location dicts.

    keys_to_keep = ['name', 'street', 'house_number', 'postcode', 'city']
    key_filter_postprocessor = geoextract.KeyFilterPostprocessor(keys_to_keep)
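
    # Illustration (added): with keys_to_keep as above, a location such as
    # {'name': 'x', 'confidence': 0.9} would be reduced to {'name': 'x'}.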

    #
    # PIPELINE CONSTRUCTION
    #

    # A pipeline connects all the different components.
    #
    # Here we're using custom extractors and a custom normalizer. We could also
    # provide our own code for splitting a document into chunks and for validation,
    # but for simplicity we'll use the default implementations in these cases.

    pipeline = geoextract.Pipeline(
        search_for,
        extractors=[pattern_extractor, name_extractor],
        normalizer=normalizer,
        postprocessors=[key_filter_postprocessor],
    )

    return pipeline.extract(text)
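
A possible call site (illustrative only; it assumes ``create_geoextract_data``
copes with the default ``bodies=None``):

    locations = extract_found_locations('Hauptstraße 5 69115 Heidelberg')
    # might yield something like:
    # [{'street': 'Hauptstraße', 'house_number': '5',
    #   'postcode': '69115', 'city': 'Heidelberg'}]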