def get_similarity(self, test_string=''):
    """
    Score how well a user query matches this train station's name.

    People often type a shortened station name (e.g. Kings Cross for
    King's Cross St Pancras), so when the query is shorter than the full name
    it is also compared against the leading part of the name of equal length.

    test_string -- the user's query
    Returns the score from get_name_similarity(), or the abbreviated score
    (capped at 99) when that is at least 85 and beats the full-name score.
    """
    full_score = get_name_similarity(self.name, test_string)

    # A query at least as long as the name cannot be an abbreviation of it
    query_length = len(test_string)
    if query_length >= len(self.name):
        return full_score

    # Compare the query against an equally-long prefix of our name
    prefix_score = get_name_similarity(self.name[:query_length], test_string)
    if prefix_score < 85 or prefix_score <= full_score:
        return full_score

    # Never 100, so an abbreviated match cannot override an exact match
    return min(prefix_score, 99)
def get_similarity(self, test_string=''):
    """
    Custom similarity match for bus stops - takes into account that many of
    them will be named after train stations or bus stations.

    test_string -- the user's query; it is wrapped in a BusStop so that it is
                   normalised the same way as this stop's own name

    Returns a score, highest first:
      100   -- normalised names are identical
      95/94 -- the query mentions (BUS)STN and our name starts/ends with it
      91/90 -- the query followed by (BUS)STN is at the start/end of our name
      else  -- whatever get_name_similarity() gives for the normalised names
    """
    # Normalise both names to facilitate easier comparison
    # NOTE(review): presumably normalisation abbreviates "station" to STN - confirm
    my_name = self.get_normalised_name()
    their_name = BusStop(test_string).get_normalised_name()

    # Exact match is obviously best
    if my_name == their_name:
        return 100

    # If user has specified a station or bus station, then a partial match at start or end of string works for us
    # We prioritise, just slightly, names that have the match at the beginning
    if re.search("(BUS)?STN", their_name):
        if my_name.startswith(their_name):
            return 95
        if my_name.endswith(their_name):
            return 94

    # If on the other hand, we add station or bus station to their name and it matches, that's also pretty good.
    # The query is user-supplied text, so escape it before building a pattern: otherwise any regex
    # metacharacter that survives normalisation could raise re.error or match unintended names
    escaped_name = re.escape(their_name)
    if re.search("^%s(BUS)?STN" % escaped_name, my_name):
        return 91
    if re.search("%s(BUS)?STN$" % escaped_name, my_name):
        return 90

    # Else fall back on name similarity
    return get_name_similarity(my_name, their_name)
def test_stringutils(self):
    """
    Unit test for the stringutils helpers: capwords, cleanup_name_from_undesirables,
    get_name_similarity, get_best_fuzzy_match and gmt_to_localtime
    """
    def make_random_string(lowest, highest):
        """Ten random characters whose code points lie between lowest and highest inclusive"""
        return "".join([chr(random.Random().randint(lowest, highest)) for _i in range(10)])

    # capwords should restore the expected capitalisation whatever case it is given,
    # and should never hand back the all-lower or all-upper form
    expected_names = ("Bank", "Morden East", "King's Cross St. Pancras", "Kennington Oval via Charing X")
    for expected_name in expected_names:
        for variant in (expected_name, expected_name.lower(), expected_name.upper()):
            self.assertEqual(expected_name, capwords(variant))
        for wrong_case in (expected_name.lower(), expected_name.upper()):
            self.assertNotEqual(wrong_case, capwords(expected_name))

    # After cleanup, none of the undesirable patterns should still be found (case-insensitively)
    # Code points 48 to 122 run from "0" to "z"
    dirty_strings = [make_random_string(48, 122) for _i in range(10)]
    undesirables = ("a", "b+", "[0-9]", "^x")
    for dirty_string in dirty_strings:
        cleaned_string = cleanup_name_from_undesirables(dirty_string, undesirables)
        for pattern in undesirables:
            self.assertIsNone(re.search(pattern, cleaned_string, flags=re.I))

    # String similarities - 100 for identical strings, 90 or more when one character is dropped,
    # and nothing at all against a string made purely of digits (code points 48 to 57)
    reference = make_random_string(65, 122)
    self.assertEqual(get_name_similarity(reference, reference), 100)
    self.assertGreaterEqual(get_name_similarity(reference, reference[:-1]), 90)
    self.assertEqual(get_name_similarity(reference, make_random_string(48, 57)), 0)

    # The longest prefix should be picked as the best fuzzy match out of a list of
    # similar-looking strings; with only very dissimilar strings there is no candidate at all
    longest_prefix = reference[:9]
    similar_candidates = (reference[:3], reference[:5], longest_prefix, "z" * 10)
    self.assertEqual(get_best_fuzzy_match(reference, similar_candidates), longest_prefix)
    dissimilar_candidates = [make_random_string(48, 57) for _i in range(10)]
    self.assertIsNone(get_best_fuzzy_match(reference, dissimilar_candidates))

    # GMT to local time conversion - times move on an hour while daylight saving is in
    # effect, otherwise they are only reformatted
    # NOTE(review): presumably assumes the machine's timezone is GMT outside DST - confirm
    if time.localtime().tm_isdst:
        expected_conversions = (("2359", "0059"), ("23:59", "0059"), ("Tue 00:01", "0101"))
    else:
        expected_conversions = (("2359", "2359"), ("23:59", "2359"), ("Tue 00:01", "0001"))
    for gmt_time, local_time in expected_conversions:
        self.assertEqual(gmt_to_localtime(gmt_time), local_time)