Example #1
def test_is_before(d1, d2, expected):
    """Ensure a d1 is before d2."""
    assert date_utils.is_before(d1, d2) == expected
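This test body presumes pytest parametrization supplies d1, d2, and expected. A minimal sketch of how it could be driven, with hypothetical sample dates (the real test data is not part of this excerpt):

import datetime

import pytest

import date_utils  # project-local module under test


@pytest.mark.parametrize(
    'd1, d2, expected',
    [
        # Hypothetical cases for illustration only.
        (datetime.date(2019, 1, 1), datetime.date(2019, 2, 1), True),
        (datetime.date(2019, 2, 1), datetime.date(2019, 1, 1), False),
    ],
)
def test_is_before(d1, d2, expected):
    """Ensure d1 is before d2."""
    assert date_utils.is_before(d1, d2) == expected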
Example #2
# External dependencies for this excerpt; project-local helpers such as
# date_utils, logger, Fields, fetch_news_page,
# extract_traffic_fatalities_page_details_link, generate_detail_page_urls,
# fetch_and_parse, and has_next are assumed to be defined in the
# surrounding module.
import asyncio

import aiohttp
from tenacity import stop_after_attempt, wait_exponential


async def async_retrieve(pages=-1, from_=None, to=None, attempts=1, backoff=1):
    """
    Retrieve fatality data.

    :param int pages: number of pages to retrieve, or -1 for all
    :param str from_: the start date
    :param str to: the end date
    :param int attempts: maximum number of fetch attempts per detail page
    :param int backoff: multiplier for the exponential backoff between retries
    :return: the list of fatalities and the number of pages that were read.
    :rtype: tuple
    """
    res = {}
    page = 1
    has_entries = False
    no_date_within_range_count = 0
    from_date = date_utils.from_date(from_)
    to_date = date_utils.to_date(to)

    logger.debug(f'Retrieving fatalities from {from_date} to {to_date}.')

    async with aiohttp.ClientSession() as session:
        while True:
            # Fetch the news page.
            logger.info(f'Fetching page {page}...')
            try:
                news_page = await fetch_news_page(session, page)
            except Exception as e:
                raise ValueError(f'Cannot retrieve news page #{page}.') from e

            # Look for traffic fatality links.
            page_details_links = extract_traffic_fatalities_page_details_link(news_page)

            # Generate the full URL for the links.
            links = generate_detail_page_urls(page_details_links)
            logger.debug(f'{len(links)} fatality page(s) to process.')

            # Fetch and parse each link.
            tasks = [
                fetch_and_parse.retry_with(
                    stop=stop_after_attempt(attempts),
                    wait=wait_exponential(multiplier=backoff),
                    reraise=True,
                )(session, link) for link in links
            ]
            page_res = await asyncio.gather(*tasks)

            if page_res:
                page_res = [person for item in page_res for person in item]
                # If the page contains fatalities, ensure all of them happened within the specified time range.
                entries_in_time_range = [
                    entry for entry in page_res if date_utils.is_between(entry[Fields.DATE], from_date, to_date)
                ]

                # If 2 pages in a row:
                #   1) contain results
                #   2) but none of them contain dates within the time range
                #   3) and we did not collect any valid entries
                # Then we can stop the operation.
                past_entries = all(date_utils.is_before(entry[Fields.DATE], from_date) for entry in page_res)
                if from_ and past_entries and not has_entries:
                    no_date_within_range_count += 1
                if no_date_within_range_count > 1:
                    logger.debug(f'{len(entries_in_time_range)} fatality page(s) within the specified time range.')
                    break

                # Check whether we found entries in the previous pages.
                if not has_entries:
                    has_entries = bool(entries_in_time_range)
                logger.debug(f'{len(entries_in_time_range)} fatality page(s) within the specified time range.')

                # If there are none in range, we do not need to search further, and we can discard the results.
                if has_entries and not entries_in_time_range:
                    logger.debug(f'No data within the specified time range on page {page}.')
                    break

                # Store the results if the ID number is new.
                res.update(
                    {entry.get(Fields.ID): entry
                     for entry in entries_in_time_range if entry.get(Fields.ID) not in res})

            # Stop if there are no further pages.
            if not has_next(news_page) or page >= pages > 0:
                break

            page += 1

    return list(res.values()), page
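Since async_retrieve is a coroutine, callers drive it with an event loop. A minimal sketch, assuming this module is importable and that date_utils accepts human-readable date strings (both assumptions, not shown in the excerpt):

import asyncio


def main():
    # Hypothetical invocation: fetch every page for January 2019, retrying
    # each detail page up to 3 times with exponential backoff.
    fatalities, pages_read = asyncio.run(
        async_retrieve(pages=-1, from_='Jan 1 2019', to='Jan 31 2019',
                       attempts=3, backoff=2))
    print(f'Collected {len(fatalities)} fatalities across {pages_read} page(s).')


if __name__ == '__main__':
    main()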
Example #3
# As in Example #2, project-local helpers are assumed to be defined in the
# surrounding module.
import asyncio

import aiohttp


async def async_retrieve(pages=-1, from_=None, to=None):
    """Retrieve fatality data."""
    res = {}
    page = 1
    has_entries = False
    no_date_within_range_count = 0

    logger.debug(
        f'Retrieving fatalities from {date_utils.from_date(from_)} to {date_utils.to_date(to)}.'
    )

    async with aiohttp.ClientSession() as session:
        while True:
            # Fetch the news page.
            logger.info(f'Fetching page {page}...')
            try:
                news_page = await fetch_news_page(session, page)
            except Exception as e:
                raise ValueError(f'Cannot retrieve news page #{page}.') from e

            # Look for traffic fatality links.
            page_details_links = extract_traffic_fatalities_page_details_link(
                news_page)

            # Generate the full URL for the links.
            links = generate_detail_page_urls(page_details_links)
            logger.debug(f'{len(links)} fatality page(s) to process.')

            # Fetch and parse each link.
            tasks = [fetch_and_parse(session, link) for link in links]
            page_res = await asyncio.gather(*tasks)

            # If the page contains fatalities, ensure all of them happened within the specified time range.
            if page_res:
                entries_in_time_range = [
                    entry for entry in page_res
                    if date_utils.is_between(entry[Fields.DATE], from_, to)
                ]

                # If 2 pages in a row:
                #   1) contain results
                #   2) but none of them contain dates within the time range
                #   3) and we did not collect any valid entries
                # Then we can stop the operation.
                if from_ and all(
                        date_utils.is_before(entry[Fields.DATE], from_)
                        for entry in page_res) and not has_entries:
                    no_date_within_range_count += 1
                if no_date_within_range_count > 1:
                    logger.debug(
                        f'{len(entries_in_time_range)} fatality page(s) within the specified time range.'
                    )
                    break

                # Check whether we found entries in the previous pages.
                if not has_entries:
                    has_entries = bool(entries_in_time_range)
                logger.debug(
                    f'{len(entries_in_time_range)} fatality page(s) within the specified time range.'
                )

                # If there are none in range, we do not need to search further, and we can discard the results.
                if has_entries and not entries_in_time_range:
                    logger.debug(
                        f'No data within the specified time range on page {page}.'
                    )
                    break

                # Store the results if the case number is new.
                res.update({
                    entry.get(Fields.CASE): entry
                    for entry in entries_in_time_range
                    if entry.get(Fields.CASE) not in res
                })

            # Stop if there are no further pages.
            if not has_next(news_page) or page >= pages > 0:
                break

            page += 1

    return list(res.values()), page
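All three examples rely on a project-local date_utils module that is not shown. A minimal sketch of what its helpers might look like, assuming datetime.date values and python-dateutil for string parsing (the names match the calls above; the bodies are guesses, not the project's actual code):

import datetime

from dateutil import parser


def from_date(date_string, default=datetime.date.min):
    """Parse a start date string, falling back to the earliest representable date."""
    return parser.parse(date_string).date() if date_string else default


def to_date(date_string, default=datetime.date.max):
    """Parse an end date string, falling back to the latest representable date."""
    return parser.parse(date_string).date() if date_string else default


def is_before(d1, d2):
    """Return True if d1 is strictly before d2."""
    return d1 < d2


def is_between(date, from_=None, to=None):
    """Return True if date falls within the inclusive [from_, to] range."""
    return (from_ or datetime.date.min) <= date <= (to or datetime.date.max)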