def fix_bus(line: str, route: str) -> Tuple[str, str]:
    """Fix the Bus lines and routes given by the original API.
    """
    with logger.contextualize(bus_line_original=line,
                              bus_route_original=route):
        logger.debug("Fixing bus line & route")

        # ROUTE: just fix chars
        route = fix_chars(route)

        # LINE:
        # Some routes start with a letter that actually belongs to the line; fix that:
        # remove the letter from the route and append it to the end of the line instead
        for letter in LINE_LETTERS:
            if route.strip().startswith(letter):
                route = route.replace(letter, "")
                letter = letter.replace('"', "").replace(" ", "")
                line = line + letter
                break

        # Replace possible left double quote marks with simple quote marks
        # Remove asterisks on bus route
        line = line.replace('"', "'")
        route = route.replace('"', "'").replace("*", "")

        # Final strip on line and route
        line = line.strip()
        route = route.strip()

        logger.bind(bus_line_fixed=line,
                    bus_route_fixed=route).debug("Fixed bus line & route")
        return line, route
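
# Hedged usage sketch (added for illustration): assuming LINE_LETTERS holds
# quoted letters such as '"A"' and fix_chars only normalizes characters, a
# route that starts with such a letter moves it onto the line:
#
#   line, route = fix_bus("5", '"A" PRAZA DE GALICIA')
#   # line == "5A", route == "PRAZA DE GALICIA"  (under the assumptions above)
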
async def endpoint_get_buses(stop_id: int, get_all_buses: bool = False):
    """Endpoint to get a list of Buses coming to a Stop giving the Stop ID.
    By default the shortest available list of buses is returned, unless 'get_all_buses' param is True
    """
    with logger.contextualize(**locals()):
        buses_result = await get_buses(stop_id, get_all_buses=get_all_buses)
        return buses_result.dict()
def clear_duplicated_buses(buses: Buses) -> Buses:
    """Given a List of Buses, find possible duplicated bus and remove them.
    Buses can be duplicated when getting all the pages from the HTML data source,
    as changes on the list of buses can happen while fetching all the pages.

    If two (or more) buses have the same bus_id (same line-route) and the same time, they are considered duplicates.
    There is still a small chance of having two duplicated buses whose times differ by +/- 1 minute, since the time
    can change between page requests. However, this is ignored for now, to reduce code complexity.

    Duplicated buses are removed from the list in-place, so the same object is returned.
    """
    with logger.contextualize(buses=buses):
        buses_start = len(buses)
        buses_ids_times = Counter()
        """Counter with tuples (bus_id, time)"""
        for bus in buses:
            buses_ids_times[(bus.bus_id, bus.time)] += 1

        for bus_id, time in [
                tup for tup, count in buses_ids_times.items() if count > 1
        ]:
            for i, repeated_bus in enumerate([
                    bus for bus in buses
                    if bus.bus_id == bus_id and bus.time == time
            ]):
                if i > 0:
                    buses.remove(repeated_bus)

        buses_diff = buses_start - len(buses)
        logger.bind(buses_diff=buses_diff).debug(
            f"Cleared {buses_diff} duplicated buses")

        return buses
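
# Hedged demonstration (added for illustration, not part of the original module):
# the dedup rule keys on (bus_id, time); shown with a minimal, hypothetical
# stand-in for the real Bus model, which is defined elsewhere in the project.
def _demo_clear_duplicated_buses():
    from dataclasses import dataclass

    @dataclass
    class FakeBus:  # hypothetical stand-in for the real Bus model
        bus_id: str
        time: int

    buses = [FakeBus("12-Centro", 5), FakeBus("12-Centro", 5), FakeBus("6-Praza", 7)]
    clear_duplicated_buses(buses)  # removes one of the two identical entries, in-place
    assert len(buses) == 2
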
def fix_stop_name(name: str) -> str:
    """Fix the Stop names given by the original data sources.
    """
    with logger.contextualize(stop_name_original=name):
        logger.debug("Fixing stop name")

        # Remove double spaces
        name = re.sub(' +', ' ', name)

        # Replace - with commas
        name = name.replace("-", ",")

        # Force one space after each comma, remove unnecessary spaces before, remove duplicated commas
        name = name.replace(",", ", ").replace(" ,", ",").replace(", ,", ",")

        # Remove unnecessary commas just before parenthesis
        name = name.replace(", (", " (").replace(",(", " (")

        # Remove unnecessary dots after parenthesis
        name = name.replace(").", ")")

        # Remove unnecessary spaces after opening or before closing parenthesis
        name = name.replace("( ", "(").replace(") ", ")")

        # Capitalize each word on the name (unless it is a preposition);
        # Set prepositions to lowercase;
        # Fix chars
        name_words = fix_chars(name).split()
        for index, word in enumerate(name_words):
            # noinspection PyBroadException
            try:
                word = word.strip().lower()
                if word not in PREPOSITIONS:
                    if word.startswith("("):
                        char = word[1]
                        word = word.replace(char, char.upper())
                    else:
                        word = word.capitalize()
                name_words[index] = word

            except Exception:
                logger.opt(exception=True).bind(
                    word=word).warning("Error fixing word")

        name = ' '.join(name_words)

        # Turn roman numbers to uppercase
        name = ' '.join(word.upper() if is_roman(word) else word
                        for word in name.split())

        logger.bind(stop_name_fixed=name).debug("Fixed stop name")
        return name
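
# Hedged example (illustrative): assuming fix_chars only normalizes characters
# and "do" is listed in PREPOSITIONS, a raw stop name is fixed like this:
#
#   fix_stop_name("PRAZA -DO  CONCELLO (centro)")
#   # -> "Praza, do Concello (Centro)"  (under the assumptions above)
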
async def get_buses(stop_id: int, get_all_buses: bool) -> BusesResponse:
    """Async function to get information of a Stop, using the BUS_GETTERS in order
    :param stop_id: Stop ID
    :param get_all_buses: if True, fetch all the available buses
    :raises: requests_async.RequestTimeout | requests_async.RequestException |
             exceptions.StopNotExist | exceptions.ParseError
    """
    last_exception = None

    # Lookup the Stop in cache; if available, verify that it exists
    cached_stop = cache.get_stop(stop_id)
    if isinstance(cached_stop, StopNotExist):
        raise cached_stop

    for getter_index, bus_getter in enumerate(BUS_GETTERS):
        getter_name = get_package(bus_getter)

        with logger.contextualize(buses_getter_name=getter_name):
            try:
                if inspect.iscoroutinefunction(bus_getter):
                    buses_result: Optional[BusesResponse] = await bus_getter(
                        stop_id, get_all_buses)
                else:
                    buses_result: Optional[BusesResponse] = bus_getter(
                        stop_id, get_all_buses)

            except StopNotExist as ex:
                last_exception = ex
                break

            except Exception as ex:
                logger.opt(exception=True).warning("Error on Buses getter")
                last_exception = ex

            else:
                if buses_result is not None:
                    if getter_index > 0:
                        # Save the Buses in cache if the bus list was not returned by the cache getter itself
                        cache.save_buses(stop_id, get_all_buses, buses_result)

                    # Add the source to the returned data
                    buses_result.source = getter_name

                    return buses_result

    # If no Buses were returned, raise the last exception
    raise last_exception
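
# Note (assumption for illustration): BUS_GETTERS is expected to be ordered from
# fastest to slowest source, e.g. a cache getter first and the HTML scraper last.
# With that ordering, the getter_index > 0 check above only persists results that
# did not come from the cache itself. A hypothetical composition:
#
#   BUS_GETTERS = [get_buses_from_cache, get_buses_from_html]
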
async def insert_stops(*stops: Stop, catch_errors: bool = False) -> InsertManyResult:
    """Insert one or multiple Stops in Mongo, provided as a single object or multiple args (comma separated).
    Return the Mongo Result on completion.
    :param catch_errors: if True, log errors and avoid raising them (useful when called as async background task)
    """
    try:
        insert_data = [stop.get_mongo_dict() for stop in stops]

        with logger.contextualize(mongo_insert_data=insert_data):
            logger.debug("Inserting stops in Mongo")
            result: InsertManyResult = await get_collection(asyncio.get_event_loop()).insert_many(insert_data)

            logger.bind(mongo_inserted_ids=result.inserted_ids).debug("Inserted stops in Mongo")
            return result

    except Exception:
        if not catch_errors:
            raise
        logger.opt(exception=True).bind(stops=stops).error("Error while saving stop/s in MongoDB")
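
# Usage sketch: when fired as a background task (the use case the catch_errors
# flag is meant for), errors are logged instead of raised:
#
#   asyncio.create_task(insert_stops(stop, catch_errors=True))
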
async def request_handler(request: Request, call_next):
    """Middleware used on FastAPI to process each request, for error & log handling
    """
    url = str(request.url)
    if url.endswith("/favicon.ico"):
        return Response(status_code=404)

    request_id = str(uuid4())
    with logger.contextualize(request_id=request_id, url=url):
        start_time = time.time()

        # noinspection PyBroadException
        try:
            logger.info("Request started")
            return await asyncio.wait_for(call_next(request),
                                          timeout=settings.endpoint_timeout)

        except Exception as exception:
            return handle_exception(exception)

        finally:
            process_time = round(time.time() - start_time, ndigits=5)
            logger.bind(last_record=True, process_time=process_time).info(
                f"Request ended in {process_time} seconds")
async def endpoint_get_stop(stop_id: int):
    """Endpoint to get information of a Stop giving the Stop ID
    """
    with logger.contextualize(**locals()):
        stop = await get_stop(stop_id)
        return stop.dict()
async def endpoint_get_stops(stop_name: str, limit: Optional[int] = None):
    """Endpoint to search stops by a given name
    """
    with logger.contextualize(**locals()):
        stops = await search_stops(stop_name=stop_name, limit=limit)
        return [stop.dict() for stop in stops]
async def get_buses(stop_id: int, get_all_buses: bool = False) -> BusesResponse:
    """Async function to get the buses incoming on a Stop from the HTML data source.
    Return a BusesResponse with the List of Buses and whether more bus pages are still available.
    :param stop_id: Stop ID
    :param get_all_buses: if True, get all Buses through all the HTML pages available
    :raises: requests_async.RequestTimeout | requests_async.RequestException |
             exceptions.StopNotExist | exceptions.ParseError
    """
    logger.debug("Searching buses on first page of external HTML data source")
    html_source = await request_html(stop_id)

    buses = parse_buses(html_source)
    _, pages_available = parse_pages(html_source)
    more_buses_available = bool(pages_available)

    logger.bind(
        buses=buses,
        pages_available=pages_available,
        more_buses_available=more_buses_available
    ).debug(f"Parsed {len(buses)} buses on the first page")

    # Try to parse extra pages available, if any
    if get_all_buses and more_buses_available:
        logger.debug("Searching for more buses on next pages")
        # Get and Parse extra pages available
        extra_parameters = parse_extra_parameters(html_source)

        try:
            if not settings.buses_pages_async:
                for page in range(2, pages_available + 2):
                    with logger.contextualize(current_page=page, pages_available=pages_available):
                        logger.debug(f"Searching buses synchronously on page {page}")
                        html_source = await request_html(stop_id, page=page, extra_params=extra_parameters)

                        assert_page_number(html_source, page)
                        more_buses = parse_buses(html_source)
                        logger.bind(buses=more_buses).debug(f"Parsed {len(more_buses)} buses on page {page}")

                        buses.extend(more_buses)

            else:
                extra_pages_coros = [
                    request_html(stop_id, page=page, extra_params=extra_parameters)
                    for page in range(2, pages_available + 2)
                ]

                logger.debug(f"Searching buses asynchronously on {len(extra_pages_coros)} more pages")
                extra_pages_html_source: List[str] = await asyncio.gather(*extra_pages_coros)

                for page, page_html_source in enumerate(extra_pages_html_source, 2):
                    logger.debug(f"Parsing buses on page {page}")
                    assert_page_number(html_source=page_html_source, expected_current_page=page)

                    page_buses = parse_buses(page_html_source)
                    logger.bind(buses=page_buses).debug(f"Parsed {len(page_buses)} buses on page {page}")

                    buses.extend(page_buses)

        except (RequestException, *ParsingExceptions):
            # Ignore exceptions while iterating the pages
            # Keep & return the buses that could be fetched
            logger.opt(exception=True).error("Error while iterating pages")

        else:
            more_buses_available = False

    clear_duplicated_buses(buses)

    response = BusesResponse(
        buses=sorted(buses, key=lambda bus: (bus.time, bus.route)),
        more_buses_available=more_buses_available
    )

    logger.bind(buses_response_data=response.dict()).debug("Generated BusesResponse")
    return response
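
# Usage sketch (hypothetical stop id): fetch every page and get a deduplicated,
# time-sorted response. more_buses_available only stays True here if a page
# request or parse failed mid-iteration:
#
#   response = await get_buses(1234, get_all_buses=True)
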
async def http_request(
        url: str,
        method: str = "GET",
        params: Optional[dict] = None,
        body: Optional[Union[dict, str]] = None,
        headers: Optional[dict] = None,
        timeout: float = settings.http_timeout,
        retries: int = settings.http_retries,
        raise_for_status: bool = True,
        not_retry_400_errors: bool = True
) -> Response:
    """Async function to perform a generic HTTP request, supporting retries

    :param url: URL to request
    :param method: HTTP method (default=GET)
    :param params: URL query params as dict (default=None)
    :param body: request body, usually a dict or string (default=None)
    :param headers: request headers as dict (default=None)
    :param timeout: timeout for each request retry in seconds (default=from settings)
    :param retries: how many times to retry the request if it fails (default=from settings)
    :param raise_for_status: if True, raise HTTPError if response is not successful (default=True)
    :param not_retry_400_errors: if True, do not retry requests that failed with a 4xx status code (default=True)
    :return: the Response object
    :raises: requests_async.RequestTimeout | requests_async.RequestException
    """
    last_error = None
    last_status_code = None

    for i in range(retries):
        with logger.contextualize(
            request_url=url,
            request_method=method,
            request_attempt=i+1,
            request_max_attempts=retries,
            request_params=params,
            request_body=body,
            request_headers=headers,
            request_timeout=timeout
        ):
            logger.debug("Requesting URL...")

            try:
                start_time = time.time()
                response: Response = await request(
                    method=method,
                    url=url,
                    params=params,
                    data=body,
                    headers=headers,
                    timeout=timeout
                )

                response_time = round(time.time() - start_time, 4)
                last_status_code = response.status_code
                logger.bind(
                    response_elapsed_time=response_time,
                    response_status_code=last_status_code,
                    response_body=response.text
                ).debug("Response received")

                if raise_for_status:
                    response.raise_for_status()
                return response

            except RequestException as ex:
                if not_retry_400_errors and last_status_code and 400 <= last_status_code < 500:
                    logger.warning("Request failed due to 400 error, not going to retry")
                    break

                logger.warning("Request failed")
                last_error = ex

    raise last_error
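
# Usage sketch (hypothetical URL): a GET with query params, retried up to
# settings.http_retries times and raising for non-2xx statuses by default:
#
#   response = await http_request("https://example.com/api/stops", params={"q": "praza"})
#   payload = response.json()
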
async def request_html(stop_id: int, page: Optional[int] = None, extra_params: Optional[Dict] = None) -> str:
    """Async function to request the webpage data source, returning the HTML content.
    :param stop_id: Stop ID
    :param page: Page to retrieve (default=None, so first page)
    :param extra_params: Additional parameters required by the data source when asking for a certain page higher than 1
                         (__VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION), as dict
    :raises: requests_async.RequestTimeout | requests_async.RequestException
    """
    # Generate params (Stop ID)
    params = {"parada": stop_id}

    # Extra params available = next pages, requiring body & updated headers
    if extra_params is not None:
        # Body/Data
        extra_params[EXTRA_DATA_PAGE] = page  # add the Page number to the extra_params
        body = EXTRA_DATA.format(**extra_params)  # format the request Body with the extra_params
        # Headers
        headers = copy.deepcopy(HEADERS)
        headers.update(HEADERS_NEXT_LOADS)  # update the original Headers with the extra items used on next pages
        headers[HEADERS_NEXT_LOADS_REFERER] = settings.html_remote_api + HEADERS_NEXT_LOADS_REFERER_PARAMS.format(
            stop_id=stop_id  # update the Referer header with the URL with the stop_id as parameter
        )
    # Extra params not available = this is the first page, body not required & use unmodified headers
    else:
        headers = HEADERS
        body = None

    # Getting first page is GET request, getting other pages is POST request
    method = get if page is None else post
    last_error = None

    # Run the Requests, with Retries support
    retries = settings.http_retries
    url = settings.html_remote_api
    timeout = settings.http_timeout

    for i in range(retries):
        with logger.contextualize(
                request_url=url,
                request_attempt=i+1,
                request_max_attempts=retries,
                request_params=params,
                request_body=body,
                request_headers=headers,
                request_timeout=timeout
        ):
            logger.debug("Requesting URL")

            try:
                start_time = time.time()
                response: Response = await method(
                    url=url,
                    params=params,
                    data=body,
                    headers=headers,
                    timeout=timeout
                )

                response_time = round(time.time() - start_time, 4)
                logger.bind(
                    response_elapsed_time=response_time,
                    response_status_code=response.status_code,
                    response_body=response.text
                ).debug("Response received")

                response.raise_for_status()
                return response.text

            except RequestException as ex:
                logger.warning("Request failed")
                last_error = ex

    raise last_error
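
# Usage sketch: the first page is a plain GET; later pages are POST requests that
# need the ASP.NET state params parsed from a previously fetched page:
#
#   html = await request_html(1234)  # hypothetical stop id, first page
#   extra = parse_extra_parameters(html)
#   html_page2 = await request_html(1234, page=2, extra_params=extra)
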