def fix_bus(line: str, route: str) -> Tuple[str, str]:
    """Fix the Bus lines and routes given by the original API.
    """
    with logger.contextualize(bus_line_original=line,
                              bus_route_original=route):
        logger.debug("Fixing bus line & route")

        # ROUTE: just fix chars
        route = fix_chars(route)

        # LINE:
        # Some routes start with a letter that actually belongs to the line.
        # Remove the letter from the route and append it to the end of the line instead.
        for letter in LINE_LETTERS:
            if route.strip().startswith(letter):
                route = route.replace(letter, "")
                letter = letter.replace('"', "").replace(" ", "")
                line = line + letter
                break

        # Replace any remaining double quote marks with single quote marks,
        # and remove asterisks from the bus route
        line = line.replace('"', "'")
        route = route.replace('"', "'").replace("*", "")

        # Final strip on line and route
        line = line.strip()
        route = route.strip()

        logger.bind(bus_line_fixed=line,
                    bus_route_fixed=route).debug("Fixed bus line & route")
        return line, route
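# Usage sketch (illustrative): assuming '"A"' is one of the LINE_LETTERS entries
# and fix_chars leaves plain ASCII untouched, fix_bus moves the letter from the
# route to the line and strips quotes/asterisks:
def _demo_fix_bus():
    line, route = fix_bus("5", '"A" Porta do Sol *')
    assert line == "5A"
    assert route == "Porta do Sol"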
def parse_buses(html_source: str) -> Buses:
    """Parse the HTML content returned after requesting the HTML data source, and parse the Stop info and List of buses.
    :param html_source: HTML source code as string
    :return: List of buses
    :raises: exceptions.StopNotExist | exceptions.exceptions.ParseError
    """
    parse_stop_exists(html_source)
    buses = list()
    html = BeautifulSoup(html_source, HTML_PARSER)

    with parsing():
        buses_table = html.find(**PARSER_BUSES_TABLE)
        # If buses_table is not found, means no buses are available
        if buses_table:
            buses_rows = list()
            for parser in PARSERS_BUSES_ROWS_INSIDE_TABLE:
                buses_rows.extend(buses_table.find_all(**parser))

            for row in buses_rows:
                bus_data_columns = row.find_all("td")

                # The header is a row but without <td> cells (<th> instead)
                if len(bus_data_columns) == 3:
                    line = bus_data_columns[0].text.replace(" ", "")
                    route = bus_data_columns[1].text.strip()
                    time = int(bus_data_columns[2].text)
                    line, route = fix_bus(line, route)
                    buses.append(Bus(line=line, route=route, time=time))

        logger.bind(buses=buses).debug(f"Parsed {len(buses)} buses")
        return buses
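# Usage sketch (illustrative): parse_buses is meant to be fed the HTML
# returned by request_html (defined further below):
async def _demo_parse_buses(stop_id: int):
    html_source = await request_html(stop_id)
    buses = parse_buses(html_source)
    for bus in buses:
        print(bus.line, bus.route, bus.time)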
def clear_duplicated_buses(buses: Buses) -> Buses:
    """Given a List of Buses, find possible duplicated bus and remove them.
    Buses can be duplicated when getting all the pages from the HTML data source,
    as changes on the list of buses can happen while fetching all the pages.

    If two (or more) buses have the same bus_id (same line-route) and the same time, they are considered duplicates.
    Still, there is a small chance of having two duplicated buses with a diff of +/- 1 min, since the time can change
    between the pages requested. However, this is ignored for now, to reduce code complexity.

    Duplicated buses are removed from the list in-place, so the same object is returned.
    """
    with logger.contextualize(buses=buses):
        buses_start = len(buses)
        buses_ids_times = Counter()
        """Counter with tuples (bus_id, time)"""
        for bus in buses:
            buses_ids_times[(bus.bus_id, bus.time)] += 1

        duplicated_keys = [tup for tup, count in buses_ids_times.items() if count > 1]
        for bus_id, time in duplicated_keys:
            repeated_buses = [
                bus for bus in buses
                if bus.bus_id == bus_id and bus.time == time
            ]
            # Keep the first occurrence, remove the rest
            for repeated_bus in repeated_buses[1:]:
                buses.remove(repeated_bus)

        buses_diff = buses_start - len(buses)
        logger.bind(buses_diff=buses_diff).debug(
            f"Cleared {buses_diff} duplicated buses")

        return buses
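# Usage sketch (illustrative): assuming Bus derives bus_id from line+route,
# only the exact (bus_id, time) duplicate is removed; the +/- 1 min
# near-duplicate is kept, as the docstring notes:
def _demo_clear_duplicated_buses():
    buses = [
        Bus(line="5A", route="Porta do Sol", time=3),
        Bus(line="5A", route="Porta do Sol", time=3),  # exact duplicate: removed
        Bus(line="5A", route="Porta do Sol", time=4),  # near-duplicate: kept
    ]
    clear_duplicated_buses(buses)
    assert len(buses) == 2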
async def read_stop(stop_id: int) -> OptionalStop:
    """Read a single Stop from Mongo by its Stop ID.
    Return the Stop if found, None otherwise.
    """
    document = await get_collection(asyncio.get_event_loop()).find_one({"_id": stop_id})

    if document:
        logger.bind(mongo_read_document_data=document).debug(
            "Read document from Mongo")
        return Stop(**document)
    else:
        logger.debug("No document found in Mongo")
def fix_stop_name(name: str) -> str:
    """Fix the Stop names given by the original data sources.
    """
    with logger.contextualize(stop_name_original=name):
        logger.debug("Fixing stop name")

        # Collapse repeated spaces into one
        name = re.sub(' +', ' ', name)

        # Replace - with commas
        name = name.replace("-", ",")

        # Force one space after each comma, remove spaces before commas, collapse duplicated commas
        name = name.replace(",", ", ").replace(" ,", ",").replace(", ,", ",")

        # Remove unnecessary commas just before parenthesis
        name = name.replace(", (", " (").replace(",(", " (")

        # Remove unnecessary dots after closing parenthesis
        name = name.replace(").", ")")

        # Remove unnecessary spaces after opening or before closing parenthesis
        name = name.replace("( ", "(").replace(" )", ")")

        # Capitalize each word on the name (if the word is at least 3 characters long);
        # Set prepositions to lowercase;
        # Fix chars
        name_words = fix_chars(name).split()
        for index, word in enumerate(name_words):
            # noinspection PyBroadException
            try:
                word = word.strip().lower()
                if word not in PREPOSITIONS:
                    if word.startswith("("):
                        char = word[1]
                        word = word.replace(char, char.upper())
                    else:
                        word = word.capitalize()
                name_words[index] = word

            except Exception:
                logger.opt(exception=True).bind(
                    word=word).warning("Error fixing word")

        name = ' '.join(name_words)

        # Turn roman numbers to uppercase
        name = ' '.join(word.upper() if is_roman(word) else word
                        for word in name.split())

        logger.bind(stop_name_fixed=name).debug("Fixed stop name")
        return name
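# Usage sketch (illustrative): assuming "de" is listed in PREPOSITIONS and
# fix_chars leaves plain ASCII untouched, the cleanup pipeline behaves like:
def _demo_fix_stop_name():
    name = fix_stop_name("RUA DE URZAIZ  - CASA (FRONTE)")
    assert name == "Rua de Urzaiz, Casa (Fronte)"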
async def search_stops(stop_name: str, limit: Optional[int] = None) -> Stops:
    """Search Stops in Mongo by their name, using a Mongo text search.
    :param limit: maximum number of results to return (default=None: no limit)
    """
    documents = list()
    cursor: AsyncIOMotorCursor = get_collection(asyncio.get_event_loop()).find(
        {"$text": {
            "$search": stop_name
        }})

    if limit is not None:
        cursor = cursor.limit(limit)

    async for document in cursor:
        documents.append(document)

    logger.bind(mongo_read_documents_data=documents).debug(
        f"Search in Mongo returned {len(documents)} documents")
    return [Stop(**document) for document in documents]
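# Setup sketch (illustrative): the $text query above requires a text index on
# the collection; a one-off setup could look like this, assuming the stop name
# is stored under the "name" field:
async def _ensure_stops_text_index():
    collection = get_collection(asyncio.get_event_loop())
    await collection.create_index([("name", "text")])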
async def get_buses(stop_id: int, get_all_buses: bool = False) -> BusesResponse:
    """Async function to get the buses incoming to a Stop from the HTML data source.
    The remote data source always returns the whole list of buses, but the output is shortened if get_all_buses=False.
    """
    logger.debug("Searching buses on external HTTP data source...")

    params = {"id": stop_id, "ttl": 5, "tipo": "TRANSPORTE-ESTIMACION-PARADA"}
    response = await http_request(
        url=ENDPOINT_URL,
        params=params
    )

    buses_response = parse_http_response(data=response.json(), get_all_buses=get_all_buses, verify_stop_exists=False)
    logger.bind(buses_response_data=buses_response.dict()).debug("Generated BusesResponse")

    return buses_response
def parse_extra_parameters(html_source: str) -> Dict:
    """Parse the Extra parameters (__VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION)
    required to fetch more pages, and return them as a Dict.
    :param html_source: HTML source code
    :return: Dict with the extra parameters
    :raises: vigobus_getters.exceptions.ParseError
    """
    with parsing():
        html = BeautifulSoup(html_source, HTML_PARSER)

        params = {key: None for key in EXTRA_DATA_REQUIRED}
        for key in params.keys():
            value = html.find("input", {"id": key})["value"]
            # Values must be URL-Parsed (e.g. replace '/' by '%2F' - https://www.urlencoder.io/python/)
            params[key] = urllib.parse.quote(value, safe="")

        logger.bind(extra_parameters=params).debug("Parsed extra parameters")
        return params
async def insert_stops(*stops: Stop, catch_errors: bool = False) -> InsertManyResult:
    """Insert one or multiple Stops in Mongo, provided as a single object or multiple args (comma separated).
    Return the Mongo Result on completion.
    :param catch_errors: if True, log errors and avoid raising them (useful when called as async background task)
    """
    try:
        insert_data = [stop.get_mongo_dict() for stop in stops]

        with logger.contextualize(mongo_insert_data=insert_data):
            logger.debug("Inserting stops in Mongo")
            result: InsertManyResult = await get_collection(asyncio.get_event_loop()).insert_many(insert_data)

            logger.bind(mongo_inserted_ids=result.inserted_ids).debug("Inserted stops in Mongo")
            return result

    except Exception:
        if not catch_errors:
            raise
        logger.opt(exception=True).bind(stops=stops).error("Error while saving stop/s in MongoDB")
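# Usage sketch (illustrative): as the docstring suggests, insert_stops can run
# as a fire-and-forget background task, with errors logged instead of raised
# (assumes a running event loop):
def _demo_insert_stop_in_background(stop: Stop):
    asyncio.create_task(insert_stops(stop, catch_errors=True))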
def parse_stop(html_source: str) -> Stop:
    """Parse the HTML content returned after requesting the HTML data source,
    parsing the Stop info and returning a Stop object.
    :param html_source: HTML source code as string
    :raises: exceptions.StopNotExist | exceptions.ParseError
    """
    parse_stop_exists(html_source)
    html = BeautifulSoup(html_source, HTML_PARSER)
    # TODO BeautifulSoup-parsed object should be passed instead of raw HTML string

    with parsing():
        stop_id = int(html.find(**PARSER_STOP_ID).text)
        stop_original_name = html.find(**PARSER_STOP_NAME).text
        if not stop_original_name:
            raise ParseError("Parsed Stop Name is empty")
        stop_name = fix_stop_name(stop_original_name)

        stop = Stop(stop_id=stop_id,
                    name=stop_name,
                    original_name=stop_original_name)
        logger.bind(stop_data=stop.dict()).debug("Parsed stop")
        return stop
async def request_handler(request: Request, call_next):
    """Middleware used on FastAPI to process each request, for error & log handling
    """
    url = str(request.url)
    if url.endswith("/favicon.ico"):
        return Response(status_code=404)

    request_id = str(uuid4())
    with logger.contextualize(request_id=request_id, url=url):
        start_time = time.time()

        # noinspection PyBroadException
        try:
            logger.info("Request started")
            return await asyncio.wait_for(call_next(request),
                                          timeout=settings.endpoint_timeout)

        except Exception as exception:
            return handle_exception(exception)

        finally:
            process_time = round(time.time() - start_time, ndigits=5)
            logger.bind(last_record=True, process_time=process_time).info(
                f"Request ended in {process_time} seconds")
async def get_buses(stop_id: int, get_all_buses: bool = False) -> BusesResponse:
    """Async function to get the buses incoming on a Stop from the HTML data source.
    Return the List of Buses AND True if more bus pages available, False if the current bus list was the only page.
    :param stop_id: Stop ID
    :param get_all_buses: if True, get all Buses through all the HTML pages available
    :raises: requests_async.RequestTimeout | requests_async.RequestException |
             exceptions.StopNotExist | exceptions.exceptions.ParseError
    """
    logger.debug("Searching buses on first page of external HTML data source")
    html_source = await request_html(stop_id)

    buses = parse_buses(html_source)
    _, pages_available = parse_pages(html_source)
    more_buses_available = bool(pages_available)

    logger.bind(
        buses=buses,
        pages_available=pages_available,
        more_buses_available=more_buses_available
    ).debug(f"Parsed {len(buses)} buses on the first page")

    # Try to parse extra pages available, if any
    if get_all_buses and more_buses_available:
        logger.debug("Searching for more buses on next pages")
        # Get and Parse extra pages available
        extra_parameters = parse_extra_parameters(html_source)

        try:
            if not settings.buses_pages_async:
                for page in range(2, pages_available + 2):
                    with logger.contextualize(current_page=page, pages_available=pages_available):
                        logger.debug(f"Searching buses synchronously on page {page}")
                        html_source = await request_html(stop_id, page=page, extra_params=extra_parameters)

                        assert_page_number(html_source, page)
                        more_buses = parse_buses(html_source)
                        logger.bind(buses=more_buses).debug(f"Parsed {len(more_buses)} buses on page {page}")

                        buses.extend(more_buses)

            else:
                extra_pages_coros = [
                    request_html(stop_id, page=page, extra_params=extra_parameters)
                    for page in range(2, pages_available + 2)
                ]

                logger.debug(f"Searching buses asynchronously on {len(extra_pages_coros)} more pages")
                extra_pages_html_source: List[str] = await asyncio.gather(*extra_pages_coros)

                for page, page_html_source in enumerate(extra_pages_html_source, 2):
                    logger.debug(f"Parsing buses on page {page}")
                    assert_page_number(html_source=page_html_source, expected_current_page=page)

                    page_buses = parse_buses(page_html_source)
                    logger.bind(buses=page_buses).debug(f"Parsed {len(page_buses)} buses on page {page}")

                    buses.extend(page_buses)

        except (RequestException, *ParsingExceptions):
            # Ignore exceptions while iterating the pages
            # Keep & return the buses that could be fetched
            logger.opt(exception=True).error("Error while iterating pages")

        else:
            more_buses_available = False

    clear_duplicated_buses(buses)

    response = BusesResponse(
        buses=sorted(buses, key=lambda bus: (bus.time, bus.route)),
        more_buses_available=more_buses_available
    )

    logger.bind(buses_response_data=response.dict()).debug("Generated BusesResponse")
    return response
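# Usage sketch (illustrative; the stop id is hypothetical):
async def _demo_get_buses():
    response = await get_buses(stop_id=5800, get_all_buses=True)
    for bus in response.buses:  # sorted by (time, route)
        print(bus.line, bus.route, bus.time)
    print("More buses available:", response.more_buses_available)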
async def http_request(
        url: str,
        method: str = "GET",
        params: Optional[dict] = None,
        body: Optional[Union[dict, str]] = None,
        headers: Optional[dict] = None,
        timeout: float = settings.http_timeout,
        retries: int = settings.http_retries,
        raise_for_status: bool = True,
        not_retry_400_errors: bool = True
) -> Response:
    """Async function to perform a generic HTTP request, supporting retries

    :param url: URL to request
    :param method: HTTP method (default=GET)
    :param params: URL query params as dict (default=None)
    :param body: request body, usually a dict or string (default=None)
    :param headers: request headers as dict (default=None)
    :param timeout: timeout for each request retry in seconds (default=from settings)
    :param retries: how many times to retry the request if it fails (default=from settings)
    :param raise_for_status: if True, raise HTTPError if response is not successful (default=True)
    :param not_retry_400_errors: if True, do not retry requests that failed with a 4xx status code (default=True)
    :return: the Response object
    :raises: requests_async.RequestTimeout | requests_async.RequestException
    """
    last_error = None
    last_status_code = None

    for i in range(retries):
        with logger.contextualize(
            request_url=url,
            request_method=method,
            request_attempt=i+1,
            request_max_attempts=retries,
            request_params=params,
            request_body=body,
            request_headers=headers,
            request_timeout=timeout
        ):
            logger.debug("Requesting URL...")

            try:
                start_time = time.time()
                response: Response = await request(
                    method=method,
                    url=url,
                    params=params,
                    data=body,
                    headers=headers,
                    timeout=timeout
                )

                response_time = round(time.time() - start_time, 4)
                last_status_code = response.status_code
                logger.bind(
                    response_elapsed_time=response_time,
                    response_status_code=last_status_code,
                    response_body=response.text
                ).debug("Response received")

                if raise_for_status:
                    response.raise_for_status()
                return response

            except RequestException as ex:
                # Record the error first, so "raise last_error" below never raises None
                last_error = ex
                if not_retry_400_errors and last_status_code and 400 <= last_status_code < 500:
                    logger.warning("Request failed with a 4xx error, not going to retry")
                    break

                logger.warning("Request failed")

    raise last_error
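# Usage sketch (illustrative; the URL is hypothetical): a GET request that
# handles the status code manually instead of raising:
async def _demo_http_request():
    response = await http_request(
        url="https://example.org/api",
        params={"q": "test"},
        retries=2,
        raise_for_status=False,
    )
    if response.status_code == 200:
        return response.json()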
async def request_html(stop_id: int, page: Optional[int] = None, extra_params: Optional[Dict] = None) -> str:
    """Async function to request the webpage data source, returning the HTML content.
    :param stop_id: Stop ID
    :param page: Page to retrieve (default=None, so first page)
    :param extra_params: Additional parameters required by the data source when requesting a page higher than 1
                         (__VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION), as dict
    :raises: requests_async.RequestTimeout | requests_async.RequestException
    """
    # Generate params (Stop ID)
    params = {"parada": stop_id}

    # Extra params available = next pages, requiring body & updated headers
    if extra_params is not None:
        # Body/Data
        extra_params[EXTRA_DATA_PAGE] = page  # add the Page number to the extra_params
        body = EXTRA_DATA.format(**extra_params)  # format the request Body with the extra_params
        # Headers
        headers = copy.deepcopy(HEADERS)
        headers.update(HEADERS_NEXT_LOADS)  # update the original Headers with the extra items used on next pages
        headers[HEADERS_NEXT_LOADS_REFERER] = settings.html_remote_api + HEADERS_NEXT_LOADS_REFERER_PARAMS.format(
            stop_id=stop_id  # update the Referer header with the URL with the stop_id as parameter
        )
    # Extra params not available = this is the first page, body not required & use unmodified headers
    else:
        headers = HEADERS
        body = None

    # Getting first page is GET request, getting other pages is POST request
    method = get if page is None else post
    last_error = None

    # Run the Requests, with Retries support
    retries = settings.http_retries
    url = settings.html_remote_api
    timeout = settings.http_timeout

    for i in range(retries):
        with logger.contextualize(
                request_url=url,
                request_attempt=i+1,
                request_max_attempts=retries,
                request_params=params,
                request_body=body,
                request_headers=headers,
                request_timeout=timeout
        ):
            logger.debug("Requesting URL")

            try:
                start_time = time.time()
                response: Response = await method(
                    url=url,
                    params=params,
                    data=body,
                    headers=headers,
                    timeout=timeout
                )

                response_time = round(time.time() - start_time, 4)
                logger.bind(
                    response_elapsed_time=response_time,
                    response_status_code=response.status_code,
                    response_body=response.text
                ).debug("Response received")

                response.raise_for_status()
                return response.text

            except RequestException as ex:
                logger.warning("Request failed")
                last_error = ex

    raise last_error