Beispiel #1
0
    def parse_message(self, text):
        """
        Parses the text of a message and returns a tuple of (routes, origin, destination, direction).

        routes is a list of strings; origin, destination and direction are strings. Any element that could not be
        extracted is None, and all four are None when the text is empty or does not parse as a REQUEST.
        """
        logging.debug("Parsing message: '%s'", text)
        if not text:
            logging.debug("Message is empty, returning nothing")
            return (None, None, None, None)

        # Get tokens, tag them and remove any tagged with None
        tokenizer = nltk.tokenize.regexp.WhitespaceTokenizer()
        tokens = tokenizer.tokenize(text.lower())
        tagged_tokens = [(word, tag) for (word, tag) in self.tagger.tag(tokens) if tag]

        # Some tags may be unknown type so we run a method on them to resolve such unknowns
        tagged_tokens = self.fix_unknown_tokens(tagged_tokens)

        # Parse the tree. If we cannot parse a legitimate request then return nothing
        parsed_tokens = self.parser.parse(tagged_tokens)
        if not subtree_exists(parsed_tokens, "REQUEST"):
            logging.debug("Message did not conform to message format, returning nothing")
            return (None, None, None, None)

        def joined_or_none(words):
            # Join the extracted words into one capitalised string; None when there is nothing to join
            return (capwords(" ".join(words)) or None) if words else None

        # Origins and destinations are built from the same kinds of tagged word
        place_tags = ("STATION_WORD", "BUS_STOP_WORD", "BUS_STOP_NUMBER")

        # Else extract the right tagged words from the parsed tree, applying capitalisation appropriately
        routes, origin, destination, direction = (None, None, None, None)
        for child in parsed_tokens.subtrees():
            # NOTE(review): `.node` looks like the NLTK 2.x Tree API (NLTK 3 uses `.label()`) - confirm the
            # NLTK version this project is pinned to before upgrading
            if child.node == "LINE_NAME":
                line_name = " ".join(extract_words(child, ("TUBE_LINE_WORD", "DLR_LINE_NAME", "AND", "CITY")))
                if line_name == "dlr":
                    # DLR is an acronym, so it is upper-cased rather than capitalised word by word
                    routes = [line_name.upper()]
                elif line_name:
                    routes = [capwords(line_name)]
                else:
                    routes = None
            elif child.node == "BUS_ROUTES":
                route_numbers = extract_words(child, ("ROUTE_NUMBER",))
                routes = [route.upper() for route in route_numbers] if route_numbers else None
            elif child.node == "ORIGIN":
                origin = extract_words(child, place_tags)
            elif child.node == "DESTINATION":
                destination = extract_words(child, place_tags)
            # This one's a bit odd, but DIRECTION is always a leaf node of REQUEST
            elif child.node == "REQUEST":
                direction = extract_words(child, ("DIRECTION",))

        origin = joined_or_none(origin)
        destination = joined_or_none(destination)
        direction = joined_or_none(direction)
        logging.debug(
            "Found routes %s from origin '%s' to destination '%s' in %s direction",
            routes,
            origin,
            destination,
            direction,
        )
        return (routes, origin, destination, direction)
Beispiel #2
0
    def test_stringutils(self):
        """
        Exercises the string utility helpers: capwords, cleanup_name_from_undesirables, get_name_similarity,
        get_best_fuzzy_match and gmt_to_localtime
        """
        def random_string(lowest, highest):
            # Ten random characters whose code points lie between lowest and highest, inclusive
            return "".join(chr(random.Random().randint(lowest, highest)) for _i in range(10))

        # capwords must give the canonical capitalisation whatever the case of its input,
        # and that result must differ from the all-lower and all-upper versions
        for expected in ("Bank", "Morden East", "King's Cross St. Pancras", "Kennington Oval via Charing X"):
            for variant in (expected, expected.lower(), expected.upper()):
                self.assertEqual(expected, capwords(variant))
            for wrong_case in (expected.lower(), expected.upper()):
                self.assertNotEqual(wrong_case, capwords(expected))

        # After cleanup, none of the undesirable patterns may match any more (case-insensitively)
        noisy_names = [random_string(48, 122) for _i in range(10)]
        undesirables = ("a", "b+", "[0-9]", "^x")
        for noisy_name in noisy_names:
            tidy_name = cleanup_name_from_undesirables(noisy_name, undesirables)
            for pattern in undesirables:
                self.assertIsNone(re.search(pattern, tidy_name, flags=re.I))

        # Similarity scores - 100 for identical strings, 90 or more with one character dropped,
        # and zero against a string of random digits
        reference = random_string(65, 122)
        self.assertEqual(get_name_similarity(reference, reference), 100)
        self.assertGreaterEqual(get_name_similarity(reference, reference[:-1]), 90)
        self.assertEqual(get_name_similarity(reference, random_string(48, 57)), 0)

        # The longest prefix should win among similar-looking candidates, and a list of
        # digit-only strings should produce no candidate at all
        closest = reference[:9]
        lookalikes = (reference[:3], reference[:5], closest, "z" * 10)
        self.assertEqual(get_best_fuzzy_match(reference, lookalikes), closest)
        digit_strings = [random_string(48, 57) for _i in range(10)]
        self.assertIsNone(get_best_fuzzy_match(reference, digit_strings))

        # Expected local times depend on whether daylight saving is in force on the machine running the test
        if time.localtime().tm_isdst:
            expected_conversions = (("2359", "0059"), ("23:59", "0059"), ("Tue 00:01", "0101"))
        else:
            expected_conversions = (("2359", "2359"), ("23:59", "2359"), ("Tue 00:01", "0001"))
        for gmt_time, local_time in expected_conversions:
            self.assertEqual(gmt_to_localtime(gmt_time), local_time)
def parse_dlr_data(dlr_data, station):
    """
    Takes a parsed XML elementTree dlr_data and the RailStation object for the station whose departures we are querying
    Returns a DepartureCollection object of all departures from the station in question, classified by platform

    Platforms listed in platforms_to_ignore are always skipped; those in platforms_to_ignore_if_empty are dropped only
    when no departures were found for them. A platform whose markup lacks one of the three train lines is treated as
    having no train on that line, rather than raising an AttributeError
    """
    # Group 1 is the destination text, group 3 the optional number of minutes until departure
    train_info_regex = re.compile(r"[1-4] (\D+)(([0-9]+) mins?)?", flags=re.I)
    platforms_to_ignore = [('tog', 'P1'),
                           ('wiq', 'P1')]
    platforms_to_ignore_if_empty = [('ban', 'P10'),
                                    ('str', 'P4B'),
                                    ('lew', 'P5')]

    # Go through each platform and get data about every train arriving, including which direction it's headed
    trains_by_platform = DepartureCollection()
    for platform in dlr_data.findall("div[@id='ttbox']"):
        # Get the platform name from the image attached: the part of the src before the first dot,
        # minus its final character, upper-cased
        img = platform.find("div[@id='platformleft']/img")
        platform_name = img.attrib['src'].split('.')[0][:-1].upper()
        if (station.code, platform_name) in platforms_to_ignore:
            continue
        trains_by_platform[platform_name] = []

        # Get the time published and the trains for this platform
        info = platform.find("div[@id='platformmiddle']")
        publication_time = info.find("div[@id='time']").text.strip()
        publication_time = datetime.strptime(publication_time, "%H:%M")
        line1 = info.find("div[@id='line1']")
        line2 = info.find("div[@id='line23']/p")
        line3 = info.find("div[@id='line23']/p/br")
        # find() returns None for a missing element, and a childless Element is falsy, so test identity
        # against None rather than truthiness before reading .text / .tail
        candidate_lines = (
            line1.text if line1 is not None else None,
            line2.text if line2 is not None else None,
            line3.tail if line3 is not None else None,
        )
        trains = [line for line in candidate_lines if line]

        # Go through trains, parse out the relevant data
        for train in trains:
            result = train_info_regex.search(train)
            if result:
                destination = capwords(result.group(1).strip())
                if destination == 'Terminates Here':
                    continue
                # A line with no "N mins" part is given a zero offset from the publication time
                minutes = result.group(3)
                departure_delta = timedelta(minutes=int(minutes) if minutes else 0)
                departure_time = (publication_time + departure_delta).strftime("%H%M")
                trains_by_platform.add_to_slot(platform_name, DLRTrain(destination, departure_time))
                logging.debug("Found a train going to %s at %s", destination, departure_time)
            else:
                logging.debug("Error - could not parse this line: %s", train)

        # If there are no trains for this platform and it is a platform that can be ignored when it is empty
        # e.g. it is the "spare" platform at a terminus, then delete this platform entirely
        if not trains_by_platform[platform_name] and (station.code, platform_name) in platforms_to_ignore_if_empty:
            del trains_by_platform[platform_name]

    # If two platforms have exact same set of destinations, treat them as one by merging
    trains_by_platform.merge_common_slots()
    return trains_by_platform
def parse_tube_data(tube_data, station, line_code):
    """
    Builds a DepartureCollection of the trains on one line at one station, classified by direction of travel

    tube_data is a parsed XML elementTree, station is the RailStation object for the station being queried, and
    line_code is a string holding the one-character code of the line we want trains for
    """
    compass_regex = re.compile("(North|East|South|West)bound", re.I)
    rail_regex = re.compile("(Inner|Outer) Rail", re.I)

    departures = DepartureCollection()
    created_at = datetime.strptime(tube_data.find('WhenCreated').text, "%d %b %Y %H:%M:%S")

    # Work out each platform's direction first, then file every wanted train under that direction
    for platform in tube_data.findall('.//P'):
        platform_name = platform.attrib['N']
        compass_match = compass_regex.search(platform_name)
        rail_match = rail_regex.search(platform_name)

        if compass_match:
            # Most stations tell us whether they are -bound in a certain direction
            direction = capwords(compass_match.group(0))
        elif rail_match:
            # Some Circle/Central Line platforms are called "Inner" and "Outer" Rail, which make no sense to
            # customers; the station's circular_directions attribute translates these into North/South/East/West
            direction = station.circular_directions[rail_match.group(1).lower()] + 'bound'
        elif station.code == "CHM":
            # Chesham and Chalfont & Latimer don't say anything at all for platforms on the Chesham branch
            direction = "Southbound"
        elif station.code == "CLF" and platform.attrib['Num'] == '3':
            direction = "Northbound"
        else:
            # The following stations will have "issues" with bidirectional platforms: North Acton, Edgware Road,
            # Loughton, White City. These are dealt with by analysing the location of the destination by the
            # calling WhensMyTrain object
            direction = "Unknown"
            logging.debug("Have encountered a platform without direction specified (%s)", platform_name)

        # Use the filter function to filter out trains that are out of service, specials or National Rail first
        wanted_trains = [
            candidate
            for candidate in platform.findall("T[@LN='%s']" % line_code)
            if filter_tube_train(candidate)
        ]
        for train in wanted_trains:
            # Gather the attributes in turn and wrap them up in a TubeTrain object
            destination = train.attrib['Destination']
            seconds_away = int(train.attrib['SecondsTo'])
            departure_time = (created_at + timedelta(seconds=seconds_away)).strftime("%H%M")
            set_number = train.attrib['SetNo']
            departures.add_to_slot(direction, TubeTrain(destination, direction, departure_time, line_code, set_number))

    return departures