Beispiel #1
0
 def test_strip_spaces(self):
     """Run the whole clean-up chain on self.address and check the result.

     The chain is single_line -> normalise -> separate_postcode (element 0
     only) -> expand_abbreviations -> remove_stopwords -> strip_spaces.
     """
     flattened = single_line(self.address)
     # Only element 0 of separate_postcode's result is fed onwards.
     address_text = separate_postcode(normalise(flattened))[0]
     expanded = expand_abbreviations(address_text)
     filtered = remove_stopwords(expanded)
     res = strip_spaces(filtered)
     self.assertEqual(res, '3BRISLEEAVENUENORTHSHIELDS')
Beispiel #2
0
    def test_jaccard_index(self):
        """Check jaccard_index against identical, empty and shortened input.

        Every sample goes through normalise -> expand_abbreviations ->
        remove_stopwords -> strip_spaces before being compared.
        """

        def prepared(raw_address):
            # Shared preprocessing chain applied to each sample address.
            normalised = normalise(raw_address)
            expanded = expand_abbreviations(normalised)
            return strip_spaces(remove_stopwords(expanded))

        # Two values built from the same input must score exactly 1.
        a1 = prepared('Flat 22, 8 St. Andrews Cross, PLYMOUTH')
        a2 = prepared('Flat 22, 8 St. Andrews Cross, PLYMOUTH')
        self.assertEqual(jaccard_index(a1, a2), 1)

        # Comparing against the empty string must score exactly 0.
        a3 = ''
        self.assertEqual(jaccard_index(a1, a3), 0)

        # Same address without the trailing town: expected above 0.7.
        a4 = prepared('Flat 22, 8 St. Andrews Cross')
        self.assertGreater(jaccard_index(a1, a4), 0.7)

        # Only the leading flat designation: expected below 0.3.
        a5 = prepared('Flat 22')
        self.assertLess(jaccard_index(a1, a5), 0.3)
Beispiel #3
0
    # The free-text address to match is taken from the first CLI argument.
    to_match = sys.argv[1]
    # Element 0 of separate_postcode's result is used as the address text and
    # element 1 as the postcode.
    res = separate_postcode(normalise(to_match))
    postcode = res[1]
    to_match = res[0]
    if postcode:
        # A postcode was extracted: fetch candidates with that exact postcode.
        addresses = db.addresses.find({'postcode': postcode})
    else:
        # No postcode: fall back to a '^'-anchored (prefix) regex query on the
        # 'phonetic' field, built from at most the first four tokens.
        # The pattern must be a raw string: '\W' in a plain literal is an
        # invalid escape sequence (DeprecationWarning since Python 3.6,
        # SyntaxWarning since 3.12).
        res = re.split(r'\W+', to_match)
        # Slicing already clamps to the list length, so no min() is needed.
        address = ' '.join(res[:4])
        # NOTE(review): the phonetic key is concatenated into the regex
        # unescaped; confirm phonetic() cannot emit regex metacharacters.
        addresses = db.addresses.find(
            {'phonetic': {
                '$regex': '^' + phonetic(address)
            }})

    # Reduce the query to the same form the candidates are reduced to below.
    to_match = strip_spaces(remove_stopwords(expand_abbreviations(to_match)))
    best_jaccard = 0
    best_match = []
    for address in addresses:
        lines = paf_to_lines(address)
        line = separate_postcode(normalise(single_line(lines)))
        line = strip_spaces(remove_stopwords(expand_abbreviations(line[0])))
        idx = jaccard_index(to_match, line)
        if idx > best_jaccard:
            # Strictly better score: restart the tie list with this candidate.
            best_jaccard = idx
            best_match = [address]
        elif idx == best_jaccard:
            # Tie with the current best. Because best_jaccard starts at 0,
            # candidates scoring exactly 0 are collected too until a
            # positive score shows up.
            best_match.append(address)

    if len(best_match) == 1:
Beispiel #4
0
 def test_expand_abbreviations(self):
     """Check expand_abbreviations on the postcode-free part of self.address.

     The input is prepared with single_line -> normalise ->
     separate_postcode, keeping only element 0 of the result.
     """
     flattened = single_line(self.address)
     address_text = separate_postcode(normalise(flattened))[0]
     res = expand_abbreviations(address_text)
     self.assertEqual(res, '3 THE BRISLEE AVENUE NORTH SHIELDS')