コード例 #1
0
def fix_bus(line: str, route: str) -> Tuple[str, str]:
    """Fix the Bus lines and routes given by the original API.

    :param line: bus line as returned by the upstream API
    :param route: bus route as returned by the upstream API
    :return: tuple of (fixed line, fixed route)
    """
    with logger.contextualize(bus_line_original=line,
                              bus_route_original=route):
        logger.debug("Fixing bus line & route")

        # ROUTE: just fix chars
        route = fix_chars(route)

        # LINE:
        # Some routes have a letter that is part of the line in it, fix that:
        # Remove the letter from route and append to the end of the line instead
        for letter in LINE_LETTERS:
            if route.strip().startswith(letter):
                # NOTE(review): str.replace removes EVERY occurrence of the
                # letter within the route, not just the leading one —
                # presumably acceptable for the real data; confirm against
                # the contents of LINE_LETTERS
                route = route.replace(letter, "")
                # Strip quote marks and spaces so only the bare letter
                # is appended to the line
                letter = letter.replace('"', "").replace(" ", "")
                line = line + letter
                break

        # Replace possible left double quote marks with simple quote marks
        # Remove asterisks on bus route
        line = line.replace('"', "'")
        route = route.replace('"', "'").replace("*", "")

        # Final strip on line and route
        line = line.strip()
        route = route.strip()

        logger.bind(bus_line_fixed=line,
                    bus_route_fixed=route).debug("Fixed bus line & route")
        return line, route
コード例 #2
0
def get_stop(stop_id: int) -> Optional[StopOrNotExist]:
    """Look up a Stop in the local Stops Cache.

    Returns the cached Stop, a cached StopNotExist exception instance
    (returned, not raised) when the non-existence was cached,
    or None on a cache miss.
    """
    cached_stop = stops_cache.get(stop_id)
    status = "found" if cached_stop else "not found"
    logger.debug(f"Stop {status} on local cache")
    return cached_stop
コード例 #3
0
ファイル: html.py プロジェクト: Lodeiro0001/Python_VigoBusAPI
async def get_stop(stop_id: int) -> Stop:
    """Fetch information of a Stop (name only) from the HTML data source, asynchronously.

    :param stop_id: Stop ID
    :raises: requests_async.Timeout | requests_async.RequestException |
             exceptions.StopNotExist | exceptions.exceptions.ParseError
    """
    logger.debug("Searching stop on external HTML data source")
    return parse_stop(await request_html(stop_id))
コード例 #4
0
async def read_stop(stop_id: int) -> OptionalStop:
    """Read a single Stop document from Mongo by its Stop ID.

    Returns a Stop built from the document, or None when no document exists.
    """
    collection = get_collection(asyncio.get_event_loop())
    document = await collection.find_one({"_id": stop_id})

    if not document:
        logger.debug("No document found in Mongo")
        return None

    logger.bind(mongo_read_document_data=document).debug(
        "Read document from Mongo")
    return Stop(**document)
コード例 #5
0
def fix_stop_name(name: str) -> str:
    """Fix the Stop names given by the original data sources.

    Normalizes whitespace and punctuation, capitalizes each word (keeping
    prepositions lowercase) and turns roman numerals uppercase.

    :param name: stop name as given by the data source
    :return: the normalized stop name
    """
    with logger.contextualize(stop_name_original=name):
        logger.debug("Fixing stop name")

        # Remove double spaces
        name = re.sub(' +', ' ', name)

        # Replace - with commas
        name = name.replace("-", ",")

        # Force one space after each comma, remove unnecessary spaces before, remove duplicated commas
        name = name.replace(",", ", ").replace(" ,", ",").replace(", ,", ",")

        # Remove unnecessary commas just before parenthesis
        name = name.replace(", (", " (").replace(",(", " (")

        # Remove unnecessary dots after parenthesis
        name = name.replace(").", ")")

        # Remove unnecessary spaces after opening or before closing parenthesis
        name = name.replace("( ", "(").replace(") ", ")")

        # Capitalize each word on the name;
        # Set prepositions to lowercase;
        # Fix chars
        name_words = fix_chars(name).split()
        for index, word in enumerate(name_words):
            # noinspection PyBroadException
            try:
                word = word.strip().lower()
                if word not in PREPOSITIONS:
                    if word.startswith("("):
                        # Uppercase ONLY the first letter after the parenthesis.
                        # (The previous str.replace-based code wrongly uppercased
                        # every occurrence of that letter within the word,
                        # e.g. "(avenida" -> "(AvenidA".)
                        word = word[:1] + word[1:2].upper() + word[2:]
                    else:
                        word = word.capitalize()
                name_words[index] = word

            except Exception:
                logger.opt(exception=True).bind(
                    word=word).warning("Error fixing word")

        name = ' '.join(name_words)

        # Turn roman numbers to uppercase
        name = ' '.join(word.upper() if is_roman(word) else word
                        for word in name.split())

        logger.bind(stop_name_fixed=name).debug("Fixed stop name")
        return name
コード例 #6
0
async def get_stop(stop_id: int) -> Stop:
    """Async function to get information of a Stop, using the STOP_GETTERS in order.

    Getters are tried in order (fastest/cache first). A Stop found by a later
    getter is saved back to the faster data storages (cache and/or Mongo).

    :param stop_id: Stop ID
    :raises: requests_async.Timeout | requests_async.RequestException |
             exceptions.StopNotExist | exceptions.ParseError
    """
    last_exception = None
    logger.debug(f"Getting stop {stop_id}")

    stop_getter: Callable
    # enumerate() instead of STOP_GETTERS.index(stop_getter): avoids repeated
    # O(n) lookups and stays correct even if the same getter appears twice
    for getter_index, stop_getter in enumerate(STOP_GETTERS):
        try:
            if inspect.iscoroutinefunction(stop_getter):
                stop: Stop = await stop_getter(stop_id)
            else:
                stop: StopOrNotExist = stop_getter(stop_id)

            # Cache getters may return a cached StopNotExist instance
            # instead of raising it; normalize by raising it here
            if isinstance(stop, Exception):
                raise stop

        except StopNotExist as ex:
            last_exception = ex
            # Save the StopNotExist status in cache, if not found by the cache
            if getter_index > 0:
                cache.save_stop_not_exist(stop_id)
            break

        except Exception as ex:
            last_exception = ex

        else:
            if stop is not None:
                # Save the Stop on local data storages
                if getter_index > 0:
                    # Save the Stop in cache if not found by the cache
                    cache.save_stop(stop)
                if getter_index > 1:
                    # Save the Stop in MongoDB if not found by Mongo
                    add_stop_created_timestamp(stop)  # Add "created" field
                    await mongo.save_stop(stop)  # non-blocking

                # Add the Source to the returned data
                stop.source = get_package(stop_getter)

                return stop

    # If Stop not returned, raise the Last Exception
    # NOTE(review): if every getter returned None without raising,
    # last_exception is still None and this raises TypeError — presumably
    # unreachable with the current getters; confirm
    raise last_exception
コード例 #7
0
def get_buses(stop_id: int, get_all_buses: bool) -> Optional[BusesResponse]:
    """Get List of Buses from the Buses Cache, by Stop ID and All Buses wanted (True/False).
    If the list of buses for the given Stop ID is not cached, None is returned.
    """
    result: Optional[BusesResponse] = buses_cache.get((stop_id, get_all_buses))
    status = "found" if result else "not found"
    logger.debug(f"Buses {status} on local cache")

    if not get_all_buses and result is None:
        # A cached All-Buses response is still valid for a Not-All-Buses
        # request: reuse it, trimming the bus list to the normal limit
        result = buses_cache.get((stop_id, True))
        if result:
            result.buses = result.buses[:settings.buses_normal_limit]
            logger.debug("Buses from a getAllBuses=True request found on local cache, valid for this request")

    return result
コード例 #8
0
async def get_buses(stop_id: int, get_all_buses: bool = False) -> BusesResponse:
    """Async function to get the buses incoming to a Stop from the external HTTP data source.

    The remote data source always returns the whole list of buses;
    the output is shortened when get_all_buses=False.
    """
    logger.debug("Searching buses on external HTTP data source...")

    response = await http_request(
        url=ENDPOINT_URL,
        params={"id": stop_id, "ttl": 5, "tipo": "TRANSPORTE-ESTIMACION-PARADA"}
    )

    buses_response = parse_http_response(
        data=response.json(),
        get_all_buses=get_all_buses,
        verify_stop_exists=False
    )
    logger.bind(buses_response_data=buses_response.dict()).debug("Generated BusesResponse")

    return buses_response
コード例 #9
0
def parse_stop_exists(html_source: str, raise_exception: bool = True) -> bool:
    """Detect, from the HTML source returned by the HTTP request, whether the stop exists.

    Must be called at the beginning of the parse_stop/parse_buses helpers.

    :param html_source: HTML source code
    :param raise_exception: if True, raise StopNotExist when the stop does not exist (default=True)
    :return: True if the stop exists; False if it does not (only when raise_exception=False)
    :raises: exceptions.StopNotExist
    """
    if "Parada Inexistente" in html_source:
        logger.debug("The stop does not exist")
        if raise_exception:
            raise StopNotExist()
        return False

    return True
コード例 #10
0
def parse_pages(html_source: str) -> Tuple[int, int]:
    """Parse the page numbers shown on the current page.

    :param html_source: HTML source code
    :return: (current page number, amount of pages available after the current one)
    :raises: vigobus_getters.exceptions.ParseError
    """
    with parsing():
        html = BeautifulSoup(html_source, HTML_PARSER)

        # Table that contains the page numbers; missing table = single page
        numbers_table = html.find(**PARSER_PAGE_NUMBERS_TABLE)
        if numbers_table is None:
            logger.debug("No extra pages found")
            # current page = 1; additional pages available = 0
            return 1, 0

        # The current page number is the one without an <a> tag in the table
        current_page = int(
            numbers_table.find(**PARSER_PAGE_NUMBER_CURRENT_INSIDE_TABLE).text)

        # Every linked (<a>) number inside the table is another page
        linked_numbers = numbers_table.find_all(
            **PARSER_PAGE_NUMBERS_LINKED_INSIDE_TABLE)

        # Collect the numeric values found, skipping non-numeric entries
        href_pages = set()
        for page_html in linked_numbers:
            try:
                href_pages.add(int(page_html.text.strip()))
            except ValueError:
                pass

        # Count how many pages come after the current one
        pages_left = sum(1 for n in href_pages if n > current_page)

        logger.debug(
            f"Current page is {current_page}, with {pages_left} additional pages"
        )
        return current_page, pages_left
コード例 #11
0
async def insert_stops(*stops: Stop, catch_errors: bool = False) -> InsertManyResult:
    """Insert one or multiple Stops in Mongo, provided as a single object or multiple args (comma separated).
    Return the Mongo Result on completion.

    :param stops: Stop objects to insert
    :param catch_errors: if True, log errors and avoid raising them (useful when called as async background task)
    :return: the InsertManyResult from Mongo (None when an error was caught)
    """
    try:
        insert_data = [stop.get_mongo_dict() for stop in stops]

        with logger.contextualize(mongo_insert_data=insert_data):
            logger.debug("Inserting stops in Mongo")
            result: InsertManyResult = await get_collection(asyncio.get_event_loop()).insert_many(insert_data)

            logger.bind(mongo_inserted_ids=result.inserted_ids).debug("Inserted stops in Mongo")
            return result

    except Exception:
        if not catch_errors:
            # Bare raise keeps the original traceback ("raise ex" would reset it)
            raise
        logger.opt(exception=True).bind(stops=stops).error("Error while saving stop/s in MongoDB")
コード例 #12
0
ファイル: html.py プロジェクト: Lodeiro0001/Python_VigoBusAPI
async def get_buses(stop_id: int, get_all_buses: bool = False) -> BusesResponse:
    """Async function to get the buses incoming on a Stop from the HTML data source.
    Return the List of Buses AND True if more bus pages available, False if the current bus list was the only page.
    :param stop_id: Stop ID
    :param get_all_buses: if True, get all Buses through all the HTML pages available
    :raises: requests_async.RequestTimeout | requests_async.RequestException |
             exceptions.StopNotExist | exceptions.exceptions.ParseError
    """
    logger.debug("Searching buses on first page of external HTML data source")
    html_source = await request_html(stop_id)

    # Page 1 gives the buses plus the pagination info
    buses = parse_buses(html_source)
    _, pages_available = parse_pages(html_source)
    more_buses_available = bool(pages_available)

    logger.bind(
        buses=buses,
        pages_available=pages_available,
        more_buses_available=more_buses_available
    ).debug(f"Parsed {len(buses)} buses on the first page")

    # Try to parse extra pages available, if any
    if get_all_buses and more_buses_available:
        logger.debug("Searching for more buses on next pages")
        # Get and Parse extra pages available
        # (pages > 1 need extra form parameters scraped from page 1)
        extra_parameters = parse_extra_parameters(html_source)

        try:
            if not settings.buses_pages_async:
                # Sequential mode: fetch pages 2..pages_available+1 one by one
                for page in range(2, pages_available + 2):
                    with logger.contextualize(current_page=page, pages_available=pages_available):
                        logger.debug(f"Searching buses synchronously on page {page}")
                        html_source = await request_html(stop_id, page=page, extra_params=extra_parameters)

                        # Ensure the server actually returned the page we asked for
                        assert_page_number(html_source, page)
                        more_buses = parse_buses(html_source)
                        logger.bind(buses=more_buses).debug(f"Parsed {len(more_buses)} buses on page {page}")

                        buses.extend(more_buses)

            else:
                # Concurrent mode: request all extra pages at once
                extra_pages_coros = [
                    request_html(stop_id, page=page, extra_params=extra_parameters)
                    for page in range(2, pages_available + 2)
                ]

                logger.debug(f"Searching buses asynchronously on {len(extra_pages_coros)} more pages")
                extra_pages_html_source: List[str] = await asyncio.gather(*extra_pages_coros)

                # gather() preserves order, so results align with pages 2, 3, ...
                for page, page_html_source in enumerate(extra_pages_html_source, 2):
                    logger.debug(f"Parsing buses on page {page}")
                    assert_page_number(html_source=page_html_source, expected_current_page=page)

                    page_buses = parse_buses(page_html_source)
                    logger.bind(buses=page_buses).debug(f"Parsed {len(page_buses)} buses on page {page}")

                    buses.extend(page_buses)

        except (RequestException, *ParsingExceptions):
            # Ignore exceptions while iterating the pages
            # Keep & return the buses that could be fetched
            # (more_buses_available stays True, so callers know the list is partial)
            logger.opt(exception=True).error("Error while iterating pages")

        else:
            # All pages fetched successfully: nothing left to load
            more_buses_available = False

    # Drop buses that appear on more than one page
    clear_duplicated_buses(buses)

    response = BusesResponse(
        buses=sorted(buses, key=lambda bus: (bus.time, bus.route)),
        more_buses_available=more_buses_available
    )

    logger.bind(buses_response_data=response.dict()).debug("Generated BusesResponse")
    return response
コード例 #13
0
def save_buses(stop_id: int, get_all_buses: bool, buses_result: BusesResponse):
    """Save a BusesResponse on the local Buses Cache.

    This function must be executed whenever a List of Buses for a Stop is found
    by any getter other than the Buses Cache itself.

    :param stop_id: Stop ID the buses belong to
    :param get_all_buses: whether the result corresponds to an All-Buses query
    :param buses_result: the BusesResponse to cache
    """
    buses_cache[(stop_id, get_all_buses)] = buses_result
    logger.debug("Saved buses on local cache")
コード例 #14
0
async def request_html(stop_id: int, page: Optional[int] = None, extra_params: Optional[Dict] = None) -> str:
    """Async function to request the webpage data source, returning the HTML content.

    :param stop_id: Stop ID
    :param page: Page to retrieve (default=None, so first page)
    :param extra_params: Additional parameters required by the data source when asking for a certain page higher than 1
                         (__VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION), as dict
    :raises: requests_async.RequestTimeout | requests_async.RequestException
    """
    # Generate params (Stop ID)
    params = {"parada": stop_id}

    # Extra params available = next pages, requiring body & updated headers
    if extra_params is not None:
        # Body/Data: merge the Page number into a COPY of extra_params
        # (the previous code mutated the caller's dict, leaking the page
        # number back to the caller between requests)
        body = EXTRA_DATA.format(**{**extra_params, EXTRA_DATA_PAGE: page})
        # Headers
        headers = copy.deepcopy(HEADERS)
        headers.update(HEADERS_NEXT_LOADS)  # update the original Headers with the extra items used on next pages
        headers[HEADERS_NEXT_LOADS_REFERER] = settings.html_remote_api + HEADERS_NEXT_LOADS_REFERER_PARAMS.format(
            stop_id=stop_id  # update the Referer header with the URL with the stop_id as parameter
        )
    # Extra params not available = this is the first page, body not required & use unmodified headers
    else:
        headers = HEADERS
        body = None

    # Getting first page is GET request, getting other pages is POST request
    method = get if page is None else post
    last_error = None

    # Run the Requests, with Retries support
    retries = settings.http_retries
    url = settings.html_remote_api
    timeout = settings.http_timeout

    for i in range(retries):
        with logger.contextualize(
                request_url=url,
                request_attempt=i+1,
                request_max_attempts=retries,
                request_params=params,
                request_body=body,
                request_headers=headers,
                request_timeout=timeout
        ):
            logger.debug("Requesting URL")

            try:
                start_time = time.time()
                response: Response = await method(
                    url=url,
                    params=params,
                    data=body,
                    headers=headers,
                    timeout=timeout
                )

                response_time = round(time.time() - start_time, 4)
                logger.bind(
                    response_elapsed_time=response_time,
                    response_status_code=response.status_code,
                    response_body=response.text
                ).debug("Response received")

                response.raise_for_status()
                return response.text

            except RequestException as ex:
                logger.warning("Request failed")
                last_error = ex

    # NOTE(review): assumes settings.http_retries >= 1; with 0 retries this
    # would raise None (TypeError) — confirm the setting's lower bound
    raise last_error
コード例 #15
0
async def http_request(
        url: str,
        method: str = "GET",
        params: Optional[dict] = None,
        body: Optional[Union[dict, str]] = None,
        headers: Optional[dict] = None,
        timeout: float = settings.http_timeout,
        retries: int = settings.http_retries,
        raise_for_status: bool = True,
        not_retry_400_errors: bool = True
) -> Response:
    """Async function to perform a generic HTTP request, supporting retries

    :param url: URL to request
    :param method: HTTP method (default=GET)
    :param params: URL query params as dict (default=None)
    :param body: request body, usually a dict or string (default=None)
    :param headers: request headers as dict (default=None)
    :param timeout: timeout for each request retry in seconds (default=from settings)
    :param retries: how many times to retry the request if it fails (default=from settings)
    :param raise_for_status: if True, raise HTTPError if response is not successful (default=True)
    :param not_retry_400_errors: if True, do not retry requests failed with a ~400 status code (default=True)
    :return: the Response object
    :raises: requests_async.RequestTimeout | requests_async.RequestException
    """
    last_error = None
    last_status_code = None

    for i in range(retries):
        with logger.contextualize(
            request_url=url,
            request_method=method,
            # Fixed typo: was "request_attemp" (now consistent with request_html)
            request_attempt=i+1,
            request_max_attempts=retries,
            request_params=params,
            request_body=body,
            request_headers=headers,
            request_timeout=timeout
        ):
            logger.debug("Requesting URL...")

            try:
                start_time = time.time()
                response: Response = await request(
                    method=method,
                    url=url,
                    params=params,
                    data=body,
                    headers=headers,
                    timeout=timeout
                )

                response_time = round(time.time() - start_time, 4)
                last_status_code = response.status_code
                logger.bind(
                    response_elapsed_time=response_time,
                    response_status_code=last_status_code,
                    response_body=response.text
                ).debug("Response received")

                if raise_for_status:
                    response.raise_for_status()
                return response

            except RequestException as ex:
                # Record the exception BEFORE the 4xx early-exit: previously the
                # break happened without setting last_error, so a first-attempt
                # 4xx made the final "raise last_error" raise None (TypeError)
                last_error = ex

                if not_retry_400_errors and last_status_code and 400 <= last_status_code < 500:
                    logger.warning("Request failed due to 400 error, not going to retry")
                    break

                logger.warning("Request failed")

    raise last_error
コード例 #16
0
def save_stop(stop: Stop):
    """Save a Stop on the local Stops Cache.

    Must be executed whenever a Stop is found by any getter
    other than the Stops Cache itself.
    """
    cache_key = stop.stop_id
    stops_cache[cache_key] = stop
    logger.debug("Saved stop on local cache")
コード例 #17
0
def save_stop_not_exist(stop_id: int):
    """Cache the fact that a Stop does not exist.

    Must be executed whenever an external data source reports
    that a Stop does not exist; a StopNotExist instance is stored
    so later cache hits can return/raise it.
    """
    not_exist_marker = StopNotExist()
    stops_cache[stop_id] = not_exist_marker
    logger.debug("Saved stop as non existing on local cache")