def test_only_building_name(self):
    """A lone building name must come back whole under ``BuildingName``."""
    building_names = ('5C', 'Victorian House', 'SHAKESPEARE HOUSE')
    for name in building_names:
        expected = OrderedDict([('BuildingName', name)])
        assert parser.tag(name) == expected
def test_only_sub_building_name(self):
    """A lone flat/apartment/unit string is tagged as ``SubBuildingName``."""
    sub_buildings = ('Flat 5', 'Apartment 1C', 'Unit A', 'Unit C3')
    for name in sub_buildings:
        expected = OrderedDict([('SubBuildingName', name)])
        assert parser.tag(name) == expected
def test_only_town(self):
    """A lone town name is labelled ``TownName`` by both parse and tag."""
    # token-level output for a single-word town
    assert parser.parse('Oxford') == [('Oxford', 'TownName')]

    # merged output, including hyphenated, multi-word and Welsh town names
    towns = (
        'STOKE-ON-TRENT',
        'ABERTAWE',
        'CHESTER LE STREET',
        'CASNEWYDD',
    )
    for town in towns:
        expected = OrderedDict([('TownName', town)])
        assert parser.tag(town) == expected
def test_only_postcode(self):
    """Full, unspaced and partial postcodes are all labelled ``Postcode``."""
    # token-level output: every token of the input carries the Postcode label
    token_cases = [
        ('RH1 2FW', ['RH1', '2FW']),
        ('RH12FW', ['RH12FW']),
        ('L1 1XX', ['L1', '1XX']),
        ('KT18', ['KT18']),
        ('SW1P', ['SW1P']),
    ]
    for text, tokens in token_cases:
        expected_tokens = [(token, 'Postcode') for token in tokens]
        assert parser.parse(text) == expected_tokens

    # merged output: the postcode is returned exactly as it was written
    tag_cases = ('WF11 9ZZ', 'EC1N 8QX', 'EC1N8QX', 'SY23 3SR', 'SY233SR')
    for text in tag_cases:
        expected = OrderedDict([('Postcode', text)])
        assert parser.tag(text) == expected
def test_only_street_name(self):
    """A lone street is tagged as ``StreetName``, abbreviations included."""
    streets = (
        'Oxford Road',
        'Regent Street',
        'NORFOLK DRIVE',
        'LONDON ROAD',
        'ST. JAMES STREET',
        'ST. ALBANS STREET',
    )
    for street in streets:
        expected = OrderedDict([('StreetName', street)])
        assert parser.tag(street) == expected
def test_only_organisation(self):
    """A lone organisation name is labelled ``OrganisationName``."""
    # token-level output for a two-word company name
    assert parser.parse('Statistics Ltd') == [
        ('Statistics', 'OrganisationName'),
        ('Ltd', 'OrganisationName'),
    ]

    # merged output for hospitals, care homes, universities, hotels etc.
    organisations = (
        'THE GLENSIDE HOSPITAL FOR NEURO REHABILITATION',
        'st albans care home',
        'HILLTOP CARE HOME',
        'SANDYLEAZE CARE HOME',
        'ST. MARGARETS RESIDENTIAL HOME',
        'WOODCROFT HOSPITAL',
        'durham university',
        'best hotel',
        'SUNNYBANK Bed and Breakfast',
        'College of St Barnabas',
        'Maiden Law Hospital',
        'Ley Community Drug Services',
    )
    for organisation in organisations:
        expected = OrderedDict([('OrganisationName', organisation)])
        assert parser.tag(organisation) == expected
def test_addresses(self):
    """Complete addresses are split into the expected ordered components."""
    # each case: (raw address, expected (label, value) pairs in output order)
    cases = [
        ('FLAT 1 7 DENZIL AVENUE SOUTHAMPTON',
         [('SubBuildingName', 'FLAT 1'),
          ('BuildingNumber', '7'),
          ('StreetName', 'DENZIL AVENUE'),
          ('TownName', 'SOUTHAMPTON')]),
        ('NIGHTINGALES RESIDENTIAL HOME WOLVERLEY COURT WOLVERLEY '
         'ROAD WOLVERLEY KIDDERMINSTER DY10 3RP',
         [('OrganisationName', 'NIGHTINGALES RESIDENTIAL HOME'),
          ('BuildingName', 'WOLVERLEY COURT'),
          ('StreetName', 'WOLVERLEY ROAD'),
          ('Locality', 'WOLVERLEY'),
          ('TownName', 'KIDDERMINSTER'),
          ('Postcode', 'DY10 3RP')]),
        ('12 ST ALBANS ROAD WATFORD WD17 1UN',
         [('BuildingNumber', '12'),
          ('StreetName', 'ST ALBANS ROAD'),
          ('TownName', 'WATFORD'),
          ('Postcode', 'WD17 1UN')]),
        ('FLAT 30 68 VINCENT SQUARE LONDON SW1P 2NZ',
         [('SubBuildingName', 'FLAT 30'),
          ('BuildingNumber', '68'),
          ('StreetName', 'VINCENT SQUARE'),
          ('TownName', 'LONDON'),
          ('Postcode', 'SW1P 2NZ')]),
        ('FLAT 4.5.3 LIBERTY QUAYS BLAKE AVENUE GILLINGHAM',
         [('SubBuildingName', 'FLAT 4.5.3'),
          ('BuildingName', 'LIBERTY QUAYS'),
          ('StreetName', 'BLAKE AVENUE'),
          ('TownName', 'GILLINGHAM')]),
        ('STUDIO 1.2 BLOCK J BIRKS HALLS NEW NORTH ROAD EXETER EX4 4ZZ',
         [('SubBuildingName', 'STUDIO 1.2 BLOCK J'),
          ('BuildingName', 'BIRKS HALLS'),
          ('StreetName', 'NEW NORTH ROAD'),
          ('TownName', 'EXETER'),
          ('Postcode', 'EX4 4ZZ')]),
        ('FLAT 50 BECK MILL COURT BECK MILL STREET MELTON MOWBRAY LE13 1PT',
         [('SubBuildingName', 'FLAT 50'),
          ('BuildingName', 'BECK MILL COURT'),
          ('StreetName', 'BECK MILL STREET'),
          ('TownName', 'MELTON MOWBRAY'),
          ('Postcode', 'LE13 1PT')]),
        # the town is literally called "street"
        ('24 high street street ba16 0eb',
         [('BuildingNumber', '24'),
          ('StreetName', 'high street'),
          ('TownName', 'street'),
          ('Postcode', 'ba16 0eb')]),
        ('COLONIA COURT RESIDENTIAL AND NURSING HOME ST. ANDREWS AVENUE '
         'COLCHESTER CO4 3AN',
         [('OrganisationName', 'COLONIA COURT RESIDENTIAL AND NURSING HOME'),
          ('StreetName', 'ST. ANDREWS AVENUE'),
          ('TownName', 'COLCHESTER'),
          ('Postcode', 'CO4 3AN')]),
        ('FLAT 51 SHAKESPEARE HOUSE NORTH CHURCH STREET NOTTINGHAM NG1 4BR',
         [('SubBuildingName', 'FLAT 51'),
          ('BuildingName', 'SHAKESPEARE HOUSE'),
          ('StreetName', 'NORTH CHURCH STREET'),
          ('TownName', 'NOTTINGHAM'),
          ('Postcode', 'NG1 4BR')]),
        # partial postcode (out code only)
        ('18 beech road street ba16',
         [('BuildingNumber', '18'),
          ('StreetName', 'beech road'),
          ('TownName', 'street'),
          ('Postcode', 'ba16')]),
        ('1 brooks road street ba16 0pp',
         [('BuildingNumber', '1'),
          ('StreetName', 'brooks road'),
          ('TownName', 'street'),
          ('Postcode', 'ba16 0pp')]),
        ('BASEMENT FLAT 28 ALEXANDRA ROAD POOLE BH14',
         [('SubBuildingName', 'BASEMENT FLAT'),
          ('BuildingNumber', '28'),
          ('StreetName', 'ALEXANDRA ROAD'),
          ('TownName', 'POOLE'),
          ('Postcode', 'BH14')]),
        ('FLAT 14.12 ARAGON TOWER GEORGE BEARD ROAD LONDON',
         [('SubBuildingName', 'FLAT 14.12'),
          ('BuildingName', 'ARAGON TOWER'),
          ('StreetName', 'GEORGE BEARD ROAD'),
          ('TownName', 'LONDON')]),
        # a number range ends up as the building name
        ('ROYAL MENCAP SOCIETY 15-17 KEW GARDENS BOGNOR REGIS PO21 5RD',
         [('OrganisationName', 'ROYAL MENCAP SOCIETY'),
          ('BuildingName', '15-17'),
          ('StreetName', 'KEW GARDENS'),
          ('TownName', 'BOGNOR REGIS'),
          ('Postcode', 'PO21 5RD')]),
    ]

    for address, components in cases:
        assert parser.tag(address) == OrderedDict(components)
def parse(self, data, normalised_field_name='ADDRESS_norm'):
    """
    Parse the address information given in the data.

    The input is first passed through ``self._normalize_input_data``; the
    addresses are then read from the column called ``normalised_field_name``.
    Every address is upper-cased and tagged with the probabilistic parser,
    after which a handful of rule-based corrections are applied (postcode
    reconciliation and formatting, London borough fix, building number
    recovery, locality clean-up, lone ``HOUSE`` organisation). The resulting
    components are added to the data as new columns.

    :param data: address data to parse; per the original documentation it is
                 expected to contain a column 'ADDRESS'
    :type data: pandas.DataFrame
    :param normalised_field_name: name of the new field to contain normalised address data
    :type normalised_field_name: str

    :return: parsed address data, as returned by ``self._parser_postprocessing``
    :rtype: pandas.DataFrame
    """
    self.log.info('Start parsing address data...')

    data = self._normalize_input_data(
        data, normalised_field_name=normalised_field_name)
    addresses = data[normalised_field_name].values
    self.log.info('{} addresses to parse...'.format(len(addresses)))

    # parser labels that become output columns, in the order they are added
    output_fields = (
        'OrganisationName', 'DepartmentName', 'SubBuildingName',
        'BuildingName', 'BuildingNumber', 'StreetName',
        'Locality', 'TownName', 'Postcode',
    )
    # temp data storage: one list per output column, one entry per address
    collected = {field: [] for field in output_fields}

    # location handed to the London borough fix; identical for every address
    borough_data_directory = os.path.join(self.currentDirectory,
                                          '../../data/')

    # loop over addresses and use the probabilistic parser to tag the address
    # components - should avoid a loop
    for address in tqdm(addresses):
        parsed = parser.tag(address.upper())
        # regular expression extraction
        possible_postcode = self._extract_postcode(address)

        # the regular expression result takes precedence: it replaces a
        # differing parser postcode and fills in a missing one
        if possible_postcode is not None:
            parsed['Postcode'] = possible_postcode

        postcode = parsed.get('Postcode')
        if postcode is not None:
            # check that there is space, if not then add if the parsed
            # postcode is long enough to contain a complete postcode. Some
            # users have partial postcodes to which one should not add a
            # space.
            if ' ' not in postcode and len(postcode) > 4:
                # the in code is always the last three characters; slice
                # rather than str.replace so that a repeated character
                # sequence earlier in the string is left untouched
                postcode = postcode[:-3] + ' ' + postcode[-3:]

            # change to all capitals
            parsed['Postcode'] = postcode.upper()

        # if Hackney etc. in StreetName then remove and move to locality if
        # town name contains London. Probabilistic parser should see more
        # cases with london localities, parsed incorrectly at the mo
        if (parsed.get('StreetName') is not None
                and parsed.get('TownName') is not None
                and 'LONDON' in parsed['TownName']):
            parsed = self._fix_london_boroughs(parsed,
                                               borough_data_directory)

        # sometimes building number gets placed at building name, take it
        # and add to building number (the building name is left as it is)
        if (parsed.get('BuildingNumber') is None
                and parsed.get('BuildingName') is not None):
            tokens = parsed['BuildingName'].split(' ')
            if len(tokens) > 1:
                try:
                    int(tokens[0])
                except ValueError:
                    # first token is not an integer - nothing to recover
                    pass
                else:
                    parsed['BuildingNumber'] = tokens[0]

        # some addresses contain place CO place, where the CO is not part of
        # the actual name - remove these. Same is true for IN e.g.
        # Road Marton IN Cleveland
        locality = parsed.get('Locality')
        if locality is not None:
            for suffix in (' CO', ' IN'):
                stripped = locality.strip()
                if stripped.endswith(suffix):
                    # drop only the trailing suffix; str.replace would also
                    # delete the same characters inside the name
                    # (e.g. 'WEST COWES CO' -> 'WESTWES')
                    locality = stripped[:-len(suffix)]
            parsed['Locality'] = locality

        # parser sometimes places house to organisation name, while it is
        # likelier that it should be subBuilding
        if (parsed.get('OrganisationName') == 'HOUSE'
                and parsed.get('SubBuildingName') is None):
            parsed['SubBuildingName'] = parsed.get('OrganisationName')

        # store the parsed information to separate lists; missing labels
        # are stored as None
        for field in output_fields:
            collected[field].append(parsed.get(field))

    # add the parsed information to the dataframe
    for field in output_fields:
        data[field] = collected[field]

    data['PAOText'] = data['BuildingName'].copy()
    data['SAOText'] = data['SubBuildingName'].copy()

    data = self._parser_postprocessing(data)

    return data