def test_strip_spaces(self):
    """Run the fixture address through the cleaning chain ending in strip_spaces."""
    flattened = single_line(self.address)
    # Only element 0 of separate_postcode()'s result is carried forward.
    without_postcode = separate_postcode(normalise(flattened))[0]
    expanded = expand_abbreviations(without_postcode)
    compact = strip_spaces(remove_stopwords(expanded))
    self.assertEqual(compact, '3BRISLEEAVENUENORTHSHIELDS')
def test_jaccard_index(self):
    """Score a reference address against identical, empty and shortened inputs."""

    def prepare(raw):
        # Every input goes through the same chain before being compared:
        # normalise -> expand_abbreviations -> remove_stopwords -> strip_spaces.
        return strip_spaces(
            remove_stopwords(expand_abbreviations(normalise(raw))))

    full_address = 'Flat 22, 8 St. Andrews Cross, PLYMOUTH'
    reference = prepare(full_address)
    duplicate = prepare(full_address)

    # Two identically prepared strings must score exactly 1.
    self.assertEqual(jaccard_index(reference, duplicate), 1)

    # Comparing against the empty string must score exactly 0.
    nothing = ''
    self.assertEqual(jaccard_index(reference, nothing), 0)

    # Same address without the town: expected to stay above 0.7.
    without_town = prepare('Flat 22, 8 St. Andrews Cross')
    self.assertGreater(jaccard_index(reference, without_town), 0.7)

    # Only the flat number: expected to fall below 0.3.
    flat_only = prepare('Flat 22')
    self.assertLess(jaccard_index(reference, flat_only), 0.3)
from addressutils import remove_stopwords
from addressutils import strip_spaces
from addressutils import jaccard_index

if __name__ == '__main__':
    # Command-line entry point: take an address from argv[1], fetch candidate
    # records from MongoDB, and prepare the state used to pick a best match.

    # Database settings come from 'addressutils.cfg' (relative path, so it is
    # resolved against the current working directory).
    config = configparser.ConfigParser()
    config.read('addressutils.cfg')
    client = MongoClient(config['DATABASE']['dbURI'])
    db = client[config['DATABASE']['dbName']]

    if len(sys.argv) < 2:
        print('>> PLEASE PROVIDE AN ADDRESS TO MATCH')
        # NOTE(review): status 0 is returned for a missing argument; kept
        # as-is so existing callers see the same exit code.
        sys.exit(0)

    to_match = sys.argv[1]
    # Element 0 is the address text, element 1 the postcode (may be falsy).
    res = separate_postcode(normalise(to_match))
    postcode = res[1]
    to_match = res[0]

    if postcode:
        # A postcode was found: query candidates by exact postcode.
        addresses = db.addresses.find({'postcode': postcode})
    else:
        # No postcode: query by a prefix match on the phonetic encoding of
        # the first (up to) four tokens of the address.
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        res = re.split(r'\W+', to_match)
        address = ' '.join(res[:4])
        addresses = db.addresses.find(
            {'phonetic': {'$regex': '^' + phonetic(address)}})

    # Reduce the query to the same compact form the unit tests feed into
    # jaccard_index.
    to_match = strip_spaces(remove_stopwords(expand_abbreviations(to_match)))
    best_jaccard = 0
    best_match = []
def test_expand_abbreviations(self):
    """Check expand_abbreviations on the postcode-free part of the fixture."""
    normalised = normalise(single_line(self.address))
    # Element 0 of the separate_postcode() result is the address text.
    address_part = separate_postcode(normalised)[0]
    expanded = expand_abbreviations(address_part)
    self.assertEqual(expanded, '3 THE BRISLEE AVENUE NORTH SHIELDS')
def test_separate_postcode(self):
    """Check both elements returned by separate_postcode for the fixture."""
    flattened = single_line(self.address)
    split = separate_postcode(normalise(flattened))
    # Index 0: address text; index 1: postcode.
    expected = ('3 THE BRISLEE AVE NORTH SHIELDS', 'NE30 2SQ')
    for position, wanted in enumerate(expected):
        self.assertEqual(split[position], wanted)
def test_normalise(self):
    """Check normalise on the single-line form of the fixture address."""
    flattened = single_line(self.address)
    normalised = normalise(flattened)
    self.assertEqual(normalised, '3 THE BRISLEE AVE NORTH SHIELDS NE30 2SQ')