コード例 #1
0
 def test_only_building_name(self):
     """A string holding only a building name is expected back whole as BuildingName."""
     for building in ('5C', 'Victorian House', 'SHAKESPEARE HOUSE'):
         assert parser.tag(building) == OrderedDict([('BuildingName', building)])
コード例 #2
0
 def test_only_sub_building_name(self):
     """A string holding only a sub-building is expected back whole as SubBuildingName."""
     sub_buildings = ('Flat 5', 'Apartment 1C', 'Unit A', 'Unit C3')
     for sub_building in sub_buildings:
         expected = [('SubBuildingName', sub_building)]
         assert parser.tag(sub_building) == OrderedDict(expected)
コード例 #3
0
 def test_only_town(self):
     """A string holding only a town is expected to be labelled TownName."""
     # token-level output: a list of (token, label) pairs
     assert parser.parse('Oxford') == [('Oxford', 'TownName')]

     # merged output: the whole input is expected under a single TownName key
     towns = ('STOKE-ON-TRENT', 'ABERTAWE', 'CHESTER LE STREET', 'CASNEWYDD')
     for town in towns:
         assert parser.tag(town) == OrderedDict([('TownName', town)])
コード例 #4
0
 def test_only_postcode(self):
     """Full, partial and unspaced postcodes are expected to be labelled Postcode."""
     # parse() is expected to label each whitespace-separated token on its own
     tokenised_inputs = ('RH1 2FW', 'RH12FW', 'L1 1XX', 'KT18', 'SW1P')
     for text in tokenised_inputs:
         expected_tokens = [(token, 'Postcode') for token in text.split()]
         assert parser.parse(text) == expected_tokens

     # tag() is expected to return the input unchanged under one Postcode key,
     # whether or not it contains the inner space
     tagged_inputs = ('WF11 9ZZ', 'EC1N 8QX', 'EC1N8QX', 'SY23 3SR', 'SY233SR')
     for text in tagged_inputs:
         assert parser.tag(text) == OrderedDict([('Postcode', text)])
コード例 #5
0
 def test_only_street_name(self):
     """A string holding only a street is expected back whole as StreetName."""
     streets = (
         'Oxford Road',
         'Regent Street',
         'NORFOLK DRIVE',
         'LONDON ROAD',
         'ST. JAMES STREET',
         'ST. ALBANS STREET',
     )
     for street in streets:
         assert parser.tag(street) == OrderedDict([('StreetName', street)])
コード例 #6
0
 def test_only_organisation(self):
     """A string holding only an organisation is expected to be labelled OrganisationName."""
     # token-level output: every token carries the OrganisationName label
     expected_tokens = [(token, 'OrganisationName')
                        for token in ('Statistics', 'Ltd')]
     assert parser.parse('Statistics Ltd') == expected_tokens

     # merged output: the whole input is expected under a single OrganisationName key
     organisations = (
         'THE GLENSIDE HOSPITAL FOR NEURO REHABILITATION',
         'st albans care home',
         'HILLTOP CARE HOME',
         'SANDYLEAZE CARE HOME',
         'ST. MARGARETS RESIDENTIAL HOME',
         'WOODCROFT HOSPITAL',
         'durham university',
         'best hotel',
         'SUNNYBANK Bed and Breakfast',
         'College of St Barnabas',
         'Maiden Law Hospital',
         'Ley Community Drug Services',
     )
     for organisation in organisations:
         expected = OrderedDict([('OrganisationName', organisation)])
         assert parser.tag(organisation) == expected
コード例 #7
0
    def test_addresses(self):
        """Complete addresses are expected to be split into the listed components, in order."""
        # (input address, expected (label, value) pairs in output order)
        cases = (
            ('FLAT 1 7 DENZIL AVENUE SOUTHAMPTON', [
                ('SubBuildingName', 'FLAT 1'),
                ('BuildingNumber', '7'),
                ('StreetName', 'DENZIL AVENUE'),
                ('TownName', 'SOUTHAMPTON'),
            ]),
            ('NIGHTINGALES RESIDENTIAL HOME WOLVERLEY COURT WOLVERLEY '
             'ROAD WOLVERLEY KIDDERMINSTER DY10 3RP', [
                ('OrganisationName', 'NIGHTINGALES RESIDENTIAL HOME'),
                ('BuildingName', 'WOLVERLEY COURT'),
                ('StreetName', 'WOLVERLEY ROAD'),
                ('Locality', 'WOLVERLEY'),
                ('TownName', 'KIDDERMINSTER'),
                ('Postcode', 'DY10 3RP'),
            ]),
            ('12 ST ALBANS ROAD WATFORD WD17 1UN', [
                ('BuildingNumber', '12'),
                ('StreetName', 'ST ALBANS ROAD'),
                ('TownName', 'WATFORD'),
                ('Postcode', 'WD17 1UN'),
            ]),
            ('FLAT 30 68 VINCENT SQUARE LONDON SW1P 2NZ', [
                ('SubBuildingName', 'FLAT 30'),
                ('BuildingNumber', '68'),
                ('StreetName', 'VINCENT SQUARE'),
                ('TownName', 'LONDON'),
                ('Postcode', 'SW1P 2NZ'),
            ]),
            ('FLAT 4.5.3 LIBERTY QUAYS BLAKE AVENUE GILLINGHAM', [
                ('SubBuildingName', 'FLAT 4.5.3'),
                ('BuildingName', 'LIBERTY QUAYS'),
                ('StreetName', 'BLAKE AVENUE'),
                ('TownName', 'GILLINGHAM'),
            ]),
            ('STUDIO 1.2 BLOCK J BIRKS HALLS NEW NORTH ROAD EXETER EX4 4ZZ', [
                ('SubBuildingName', 'STUDIO 1.2 BLOCK J'),
                ('BuildingName', 'BIRKS HALLS'),
                ('StreetName', 'NEW NORTH ROAD'),
                ('TownName', 'EXETER'),
                ('Postcode', 'EX4 4ZZ'),
            ]),
            ('FLAT 50 BECK MILL COURT BECK MILL STREET MELTON MOWBRAY LE13 1PT', [
                ('SubBuildingName', 'FLAT 50'),
                ('BuildingName', 'BECK MILL COURT'),
                ('StreetName', 'BECK MILL STREET'),
                ('TownName', 'MELTON MOWBRAY'),
                ('Postcode', 'LE13 1PT'),
            ]),
            ('24 high street street ba16 0eb', [
                ('BuildingNumber', '24'),
                ('StreetName', 'high street'),
                ('TownName', 'street'),
                ('Postcode', 'ba16 0eb'),
            ]),
            ('COLONIA COURT RESIDENTIAL AND NURSING HOME ST. ANDREWS AVENUE COLCHESTER CO4 3AN', [
                ('OrganisationName', 'COLONIA COURT RESIDENTIAL AND NURSING HOME'),
                ('StreetName', 'ST. ANDREWS AVENUE'),
                ('TownName', 'COLCHESTER'),
                ('Postcode', 'CO4 3AN'),
            ]),
            ('FLAT 51 SHAKESPEARE HOUSE NORTH CHURCH STREET NOTTINGHAM NG1 4BR', [
                ('SubBuildingName', 'FLAT 51'),
                ('BuildingName', 'SHAKESPEARE HOUSE'),
                ('StreetName', 'NORTH CHURCH STREET'),
                ('TownName', 'NOTTINGHAM'),
                ('Postcode', 'NG1 4BR'),
            ]),
            ('18 beech road street ba16', [
                ('BuildingNumber', '18'),
                ('StreetName', 'beech road'),
                ('TownName', 'street'),
                ('Postcode', 'ba16'),
            ]),
            ('1 brooks road street ba16 0pp', [
                ('BuildingNumber', '1'),
                ('StreetName', 'brooks road'),
                ('TownName', 'street'),
                ('Postcode', 'ba16 0pp'),
            ]),
            ('BASEMENT FLAT 28 ALEXANDRA ROAD POOLE BH14', [
                ('SubBuildingName', 'BASEMENT FLAT'),
                ('BuildingNumber', '28'),
                ('StreetName', 'ALEXANDRA ROAD'),
                ('TownName', 'POOLE'),
                ('Postcode', 'BH14'),
            ]),
            ('FLAT 14.12 ARAGON TOWER GEORGE BEARD ROAD LONDON', [
                ('SubBuildingName', 'FLAT 14.12'),
                ('BuildingName', 'ARAGON TOWER'),
                ('StreetName', 'GEORGE BEARD ROAD'),
                ('TownName', 'LONDON'),
            ]),
            ('ROYAL MENCAP SOCIETY 15-17 KEW GARDENS BOGNOR REGIS PO21 5RD', [
                ('OrganisationName', 'ROYAL MENCAP SOCIETY'),
                ('BuildingName', '15-17'),
                ('StreetName', 'KEW GARDENS'),
                ('TownName', 'BOGNOR REGIS'),
                ('Postcode', 'PO21 5RD'),
            ]),
        )

        for address, components in cases:
            assert parser.tag(address) == OrderedDict(components)
コード例 #8
0
    def parse(self, data, normalised_field_name='ADDRESS_norm'):
        """
        Parse the address information given in the data.

        Assumes that the address information is stored in a column named 'ADDRESS'.

        The data are first passed through ``self._normalize_input_data``. Every value of the
        ``normalised_field_name`` column is then upper-cased and tagged with the probabilistic
        parser, after which a handful of rule-based corrections are applied (postcode
        reconciliation and formatting, London localities, building number recovery, dangling
        ' CO' / ' IN' in localities, and 'HOUSE' mis-tagged as organisation). The tagged
        components are written to new columns and the frame is finally handed to
        ``self._parser_postprocessing``.

        :param data: address data containing a column 'ADDRESS' to parse
        :type data: pandas.DataFrame
        :param normalised_field_name: name of the new field to contain normalised address data
        :type normalised_field_name: str

        :return: parsed address data
        :rtype: pandas.DataFrame
        """
        self.log.info('Start parsing address data...')

        data = self._normalize_input_data(
            data, normalised_field_name=normalised_field_name)

        addresses = data[normalised_field_name].values
        self.log.info('{} addresses to parse...'.format(len(addresses)))

        # parser labels double as output column names; order fixes the column insertion order
        labels = ('OrganisationName', 'DepartmentName', 'SubBuildingName',
                  'BuildingName', 'BuildingNumber', 'StreetName', 'Locality',
                  'TownName', 'Postcode')
        # temp data storage: one list per label, filled row by row
        columns = {label: [] for label in labels}

        # loop invariant, only needed for the London locality fix
        data_directory = os.path.join(self.currentDirectory, '../../data/')

        # loop over addresses and use the probabilistic parser to tag the address components - should avoid a loop
        for address in tqdm(addresses):
            parsed = parser.tag(address.upper())
            possible_postcode = self._extract_postcode(
                address)  # regular expression extraction

            # the regular expression result wins whenever it exists: it replaces a differing
            # parser postcode and fills in the postcode when the parser found none
            if possible_postcode is not None and parsed.get(
                    'Postcode') != possible_postcode:
                parsed['Postcode'] = possible_postcode

            postcode = parsed.get('Postcode')
            if postcode is not None:
                # check that there is space, if not then add if the parsed postcode is long enough to contain a complete
                # postcode. Some users have partial postcodes to which one should not add a space.
                if ' ' not in postcode and len(postcode) > 4:
                    # split by position: the last three characters are the in code. Slicing
                    # (rather than str.replace) leaves the out code intact even if it happens
                    # to contain the same characters as the in code.
                    postcode = postcode[:-3] + ' ' + postcode[-3:]

                # change to all capitals
                parsed['Postcode'] = postcode.upper()

            # if Hackney etc. in StreetName then remove and move to locality if town name contains London
            # Probabilistic parser should see more cases with london localities, parsed incorrectly at the mo
            if parsed.get('StreetName') is not None and parsed.get(
                    'TownName') is not None:
                if 'LONDON' in parsed['TownName']:
                    parsed = self._fix_london_boroughs(parsed, data_directory)

            # sometimes the building number gets placed in the building name; if the first of several
            # space-separated tokens is an integer, copy it to building number (the name is left as is)
            if parsed.get('BuildingNumber') is None and parsed.get(
                    'BuildingName') is not None:
                tokens = parsed['BuildingName'].split(' ')
                if len(tokens) > 1:
                    try:
                        int(tokens[0])
                    except ValueError:
                        pass
                    else:
                        parsed['BuildingNumber'] = tokens[0]

            # some addresses contain place CO place, where the CO is not part of the actual name - remove these
            # same is true for IN e.g. Road Marton IN Cleveland
            locality = parsed.get('Locality')
            if locality is not None:
                for suffix in (' CO', ' IN'):
                    if locality.strip().endswith(suffix):
                        # drop the trailing word only; occurrences inside the name
                        # (e.g. 'NORTH COVE CO') must survive
                        locality = locality.strip()[:-len(suffix)]
                        parsed['Locality'] = locality

            # parser sometimes places house to organisation name, while it is likelier that it should be subBuilding
            if parsed.get('OrganisationName') == 'HOUSE' and parsed.get(
                    'SubBuildingName') is None:
                parsed['SubBuildingName'] = parsed.get('OrganisationName')

            # store the parsed information to separate lists (None for missing components)
            for label in labels:
                columns[label].append(parsed.get(label))

        # add the parsed information to the dataframe
        for label in labels:
            data[label] = columns[label]
        data['PAOText'] = data['BuildingName'].copy()
        data['SAOText'] = data['SubBuildingName'].copy()

        data = self._parser_postprocessing(data)

        return data