def get_urls_to_download() -> Set[str]:
    """Scrape the state aggregate page and return the set of weekly
    jail report PDF URLs to download.

    A handful of known-bad links are skipped or patched by hand; every
    other link must parse to a date on/after ``ACCEPTABLE_DATE``, since
    the report format changed before then.
    """
    # NOTE(review): verify=False disables TLS certificate validation —
    # presumably a workaround for a broken cert on the state site; confirm.
    raw_page = requests.get(STATE_AGGREGATE_URL, verify=False).text
    tree = html.fromstring(raw_page)

    report_urls = set()
    for href in tree.xpath("//a/@href"):
        href = unquote(href)
        lowered = href.lower()
        if "weekly jail" not in lowered or "pdf" not in lowered:
            continue
        # This report is missing and the page gives a 404.
        if href.endswith("09-12-19.pdf"):
            continue
        # Fix typo in link for October 17, 2019.
        if href.endswith("12-17-19.pdf"):
            href = href.replace("12-17-19", "10-17-19")
        elif href.endswith("09-26-19.pdf"):
            href = href.replace("09-26-19.pdf", "09-26-19 new.pdf")
        elif href.endswith("11-14-19.pdf"):
            href = href.replace("2020", "2019")
        else:
            # Make sure we only take things after Aug 9th 2018 as the
            # format changed before that.
            if parse_date(href) < ACCEPTABLE_DATE:
                continue
        report_urls.add(BASE_URL.format(href))
    return report_urls
# Beispiel #2 (Example #2) — score: 0
def get_urls_to_download() -> Set[str]:
    """Return the set of weekly jail report PDF URLs found on the
    state aggregate page.

    Only links dated on/after ``ACCEPTABLE_DATE`` (Aug 9th 2018) are
    kept, because the report format changed before that date.
    """
    tree = html.fromstring(requests.get(STATE_AGGREGATE_URL).text)

    # Build the URL set in one pass; the date check only runs for
    # links that already look like weekly jail PDFs (short-circuit).
    return {
        BASE_URL.format(link)
        for link in map(unquote, tree.xpath('//a/@href'))
        if 'weekly jail' in link.lower()
        and 'pdf' in link.lower()
        and parse_date(link) >= ACCEPTABLE_DATE
    }
# Beispiel #3 (Example #3) — score: 0
def get_urls_to_download() -> Set[str]:
    """Scrape the state aggregate page for weekly jail report PDF links.

    Two mislabeled links are patched by hand; all remaining links must
    parse to a date on/after ``ACCEPTABLE_DATE``, since the report
    format changed before then.
    """
    tree = html.fromstring(requests.get(STATE_AGGREGATE_URL).text)

    urls = set()
    for raw_link in tree.xpath('//a/@href'):
        link = unquote(raw_link)
        low = link.lower()
        if 'weekly jail' not in low or 'pdf' not in low:
            continue
        if link.endswith('12-17-19.pdf'):
            # Fix typo in link for October 17, 2019.
            link = link.replace('12-17-19', '10-17-19')
        elif link.endswith('09-26-19.pdf'):
            link = link.replace('09-26-19.pdf', '09-26-19 new.pdf')
        elif parse_date(link) < ACCEPTABLE_DATE:
            # Make sure we only take things after Aug 9th 2018 as the
            # format changed before that.
            continue
        urls.add(BASE_URL.format(link))
    return urls