def parse_message(self, text):
    """
    Parses the text of a request and returns a 4-tuple of (routes, origin, destination, direction).

    routes is a list of strings; origin, destination and direction are strings. Any element that
    could not be extracted is None, and all four are None when the text is empty or when the
    parsed tree contains no REQUEST subtree.
    """
    logging.debug("Parsing message: '%s'", text)
    if not text:
        logging.debug("Message is empty, returning nothing")
        return (None, None, None, None)

    def capitalised_or_none(words):
        """ Joins words with spaces and capitalises them; None if there is nothing to return """
        if not words:
            return None
        return capwords(" ".join(words)) or None

    # Get tokens, tag them and remove any tagged with None
    tokenizer = nltk.tokenize.regexp.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(text.lower())
    tagged_tokens = [(word, tag) for (word, tag) in self.tagger.tag(tokens) if tag]

    # Some tags may be unknown type so we run a method on them to resolve such unknowns
    tagged_tokens = self.fix_unknown_tokens(tagged_tokens)

    # Parse the tree. If we cannot parse a legitimate request then return nothing
    parsed_tokens = self.parser.parse(tagged_tokens)
    if not subtree_exists(parsed_tokens, "REQUEST"):
        logging.debug("Message did not conform to message format, returning nothing")
        return (None, None, None, None)

    # Else extract the right tagged words from the parsed tree, applying capitalisation appropriately
    routes, origin, destination, direction = (None, None, None, None)
    place_tags = ("STATION_WORD", "BUS_STOP_WORD", "BUS_STOP_NUMBER")
    # NOTE(review): `.node` is the NLTK 2 Tree attribute; NLTK 3 replaced it with `.label()`.
    # Confirm which NLTK version this runs against before upgrading.
    for child in parsed_tokens.subtrees():
        if child.node == "LINE_NAME":
            line_words = extract_words(child, ("TUBE_LINE_WORD", "DLR_LINE_NAME", "AND", "CITY"))
            line_name = " ".join(line_words)
            # "dlr" is an acronym so it is upper-cased rather than capitalised like other line names
            if line_name == "dlr":
                routes = [line_name.upper()]
            elif line_name:
                routes = [capwords(line_name)]
            else:
                routes = None
        elif child.node == "BUS_ROUTES":
            route_numbers = extract_words(child, ("ROUTE_NUMBER",))
            # No route numbers at all collapses to None rather than an empty list
            routes = [number.upper() for number in route_numbers or ()] or None
        elif child.node == "ORIGIN":
            origin = extract_words(child, place_tags)
        elif child.node == "DESTINATION":
            destination = extract_words(child, place_tags)
        # This one's a bit odd, but DIRECTION is always a leaf node of REQUEST
        elif child.node == "REQUEST":
            direction = extract_words(child, ("DIRECTION",))

    # The loop collected word lists; turn each into a single capitalised string (or None)
    origin = capitalised_or_none(origin)
    destination = capitalised_or_none(destination)
    direction = capitalised_or_none(direction)

    logging.debug(
        "Found routes %s from origin '%s' to destination '%s' in %s direction",
        routes,
        origin,
        destination,
        direction,
    )
    return (routes, origin, destination, direction)
def test_stringutils(self):
    """
    Unit test for the string helpers: capwords, cleanup_name_from_undesirables,
    get_name_similarity, get_best_fuzzy_match and gmt_to_localtime
    """
    def random_string(low, high):
        """ Ten random characters whose code points lie between low and high inclusive """
        return "".join([chr(random.Random().randint(low, high)) for _ in range(0, 10)])

    # capwords must produce the expected capitalisation from any casing of the input, and the
    # result must differ from the all-lower and all-upper forms
    expected_names = ("Bank", "Morden East", "King's Cross St. Pancras", "Kennington Oval via Charing X")
    for expected in expected_names:
        self.assertEqual(expected, capwords(expected))
        self.assertEqual(expected, capwords(expected.lower()))
        self.assertEqual(expected, capwords(expected.upper()))
        self.assertNotEqual(expected.lower(), capwords(expected))
        self.assertNotEqual(expected.upper(), capwords(expected))

    # After cleanup, none of the undesirable patterns may match (case-insensitively)
    samples = [random_string(48, 122) for _ in range(0, 10)]
    undesirables = ("a", "b+", "[0-9]", "^x")
    for sample in samples:
        cleaned = cleanup_name_from_undesirables(sample, undesirables)
        for pattern in undesirables:
            self.assertIsNone(re.search(pattern, cleaned, flags=re.I))

    # Similarity scores: 100 for identical strings, at least 90 with one character dropped,
    # and 0 against a string of digits, which shares no characters with the reference
    reference = random_string(65, 122)
    self.assertEqual(get_name_similarity(reference, reference), 100)
    self.assertGreaterEqual(get_name_similarity(reference, reference[:-1]), 90)
    self.assertEqual(get_name_similarity(reference, random_string(48, 57)), 0)

    # The longest prefix (second-to-last candidate) should win the fuzzy match, while a list
    # of digit-only strings should yield no candidate at all
    prefixes = (reference[:3], reference[:5], reference[:9], "z" * 10)
    self.assertEqual(get_best_fuzzy_match(reference, prefixes), prefixes[-2])
    digit_strings = [random_string(48, 57) for _ in range(0, 10)]
    self.assertIsNone(get_best_fuzzy_match(reference, digit_strings))

    # gmt_to_localtime is expected to add an hour while local DST is in effect and to leave
    # the time unchanged otherwise, returning HHMM for each accepted input format.
    # NOTE(review): these expectations presumably assume the host clock is on UK time - confirm
    gmt_inputs = ("2359", "23:59", "Tue 00:01")
    if time.localtime().tm_isdst:
        local_outputs = ("0059", "0059", "0101")
    else:
        local_outputs = ("2359", "2359", "0001")
    for gmt_input, local_output in zip(gmt_inputs, local_outputs):
        self.assertEqual(gmt_to_localtime(gmt_input), local_output)
def parse_dlr_data(dlr_data, station):
    """
    Builds a DepartureCollection of DLR departures, one slot per platform.

    dlr_data is the parsed element tree of the departure board and station is the RailStation
    object for the station being queried (only its code attribute is read here).

    Returns the DepartureCollection after merge_common_slots() has been applied to it.
    """
    train_pattern = re.compile(r"[1-4] (\D+)(([0-9]+) mins?)?", flags=re.I)
    # (station code, platform) pairs that are always skipped
    always_skipped = [('tog', 'P1'), ('wiq', 'P1')]
    # (station code, platform) pairs that are dropped only when they end up with no trains
    skipped_when_empty = [('ban', 'P10'), ('str', 'P4B'), ('lew', 'P5')]

    departures = DepartureCollection()
    for platform in dlr_data.findall("div[@id='ttbox']"):
        # The platform name comes from the attached image's file name: the part before the
        # first dot, minus its final character, upper-cased
        image = platform.find("div[@id='platformleft']/img")
        image_stem = image.attrib['src'].split('.')[0]
        platform_name = image_stem[:-1].upper()
        if (station.code, platform_name) in always_skipped:
            continue
        departures[platform_name] = []

        # Publication time and up to three lines of train text live in the middle box
        # NOTE(review): assumes the time, line1 and line23 elements are always present - confirm
        board = platform.find("div[@id='platformmiddle']")
        published_text = board.find("div[@id='time']").text.strip()
        published = datetime.strptime(published_text, "%H:%M")
        first = board.find("div[@id='line1']")
        second = board.find("div[@id='line23']/p")
        third = board.find("div[@id='line23']/p/br")
        board_lines = (first.text, second.text, third.tail)

        for board_line in filter(None, board_lines):
            match = train_pattern.search(board_line)
            if not match:
                logging.debug("Error - could not parse this line: %s", board_line)
                continue
            destination = capwords(match.group(1).strip())
            if destination == 'Terminates Here':
                continue
            # A line with no "N mins" part is treated as departing at the publication time
            minutes = match.group(3)
            wait = timedelta(minutes=int(minutes) if minutes else 0)
            departure_time = (published + wait).strftime("%H%M")
            departures.add_to_slot(platform_name, DLRTrain(destination, departure_time))
            logging.debug("Found a train going to %s at %s", destination, departure_time)

        # Drop the platform only when it has no trains AND it is one that can be ignored when
        # empty, e.g. it is the "spare" platform at a terminus
        if not departures[platform_name] and (station.code, platform_name) in skipped_when_empty:
            del departures[platform_name]

    # If two platforms have exact same set of destinations, treat them as one by merging
    departures.merge_common_slots()
    return departures
def parse_tube_data(tube_data, station, line_code):
    """
    Builds a DepartureCollection of Tube departures, one slot per direction.

    tube_data is the parsed element tree of the departure feed, station is the RailStation
    object for the station being queried, and line_code is the one-character code of the
    line we want trains for.

    Returns a DepartureCollection holding a TubeTrain for every train that passes
    filter_tube_train, classified by direction.
    """
    compass_pattern = re.compile("(North|East|South|West)bound", re.I)
    rail_pattern = re.compile("(Inner|Outer) Rail", re.I)

    departures = DepartureCollection()
    created_text = tube_data.find('WhenCreated').text
    published = datetime.strptime(created_text, "%d %b %Y %H:%M:%S")

    for platform in tube_data.findall('.//P'):
        platform_name = platform.attrib['N']
        compass_match = compass_pattern.search(platform_name)
        rail_match = rail_pattern.search(platform_name)

        if compass_match:
            # Most stations tell us whether they are -bound in a certain direction
            direction = capwords(compass_match.group(0))
        elif rail_match:
            # Some platforms are called "Inner" and "Outer" Rail, which make no sense to customers;
            # station.circular_directions translates these into a compass direction
            direction = station.circular_directions[rail_match.group(1).lower()] + 'bound'
        elif station.code == "CHM":
            # Chesham and Chalfont & Latimer don't say anything at all for the platforms on
            # the Chesham branch of the Met Line, so these two cases are hard-coded
            direction = "Southbound"
        elif station.code == "CLF" and platform.attrib['Num'] == '3':
            direction = "Northbound"
        else:
            # Platforms that give no direction at all (e.g. bidirectional ones) are labelled
            # "Unknown" here and left for the caller to resolve
            direction = "Unknown"
            logging.debug("Have encountered a platform without direction specified (%s)", platform_name)

        # Only trains on the requested line that pass filter_tube_train are kept
        line_trains = platform.findall("T[@LN='%s']" % line_code)
        for train in [candidate for candidate in line_trains if filter_tube_train(candidate)]:
            destination = train.attrib['Destination']
            wait = timedelta(seconds=int(train.attrib['SecondsTo']))
            departure_time = (published + wait).strftime("%H%M")
            set_number = train.attrib['SetNo']
            departures.add_to_slot(
                direction,
                TubeTrain(destination, direction, departure_time, line_code, set_number),
            )

    return departures