def process_restricted_report(div, year_range, REPORTS_URL):
  title = div.contents[0]
  span = div.div.span.string.strip()
  report_number = span.split(': ')[0]
  report_date = parse_date(span.split(': ')[-1])
  if not report_date:
    admin.log_no_date("gaoreports", report_number, title)
    return
  if report_date.year not in year_range:
    return None

  report = {
    'inspector': 'gaoreports',
    'inspector_url': 'https://www.gao.gov',
    # often GAO reports do focus on a program in a specific external agency,
    # but we're not attempting to discern it in a structured way.
    # We'll just have GAO for the inspector and the agency.
    'agency': 'gao',
    'agency_name': 'Government Accountability Office',
    'report_id': report_number,
    'unreleased': True,
    'landing_url': REPORTS_URL,
    'title': title,
    'type': 'Unreleased report',
    'published_on': datetime.datetime.strftime(report_date, "%Y-%m-%d"),
  }
  return report

def report_from(all_text, link_text, link_url, page_url, published_on, paragraph):
  report = {
    'inspector': 'exim',
    'inspector_url': 'http://www.exim.gov/about/oig',
    'agency': 'exim',
    'agency_name': 'Export-Import Bank of the United States'
  }

  link_text = link_text.strip()
  link_url = urljoin(page_url, link_url)
  all_text = all_text.strip()

  report_type = type_for(page_url, all_text, paragraph)

  url_match = IDENTIFIER_RE_URL.search(link_url)
  text_match = IDENTIFIER_RE_TEXT.search(all_text)
  if url_match:
    report_id = url_match.group(1)
  elif text_match:
    report_id = text_match.group(1)
  elif (page_url == PRESS_RELEASES_URL or
        page_url == INSPECTIONS_EVALUATIONS_SPECIAL_REPORTS_URL):
    report_id = link_text.replace(":", "")
  elif page_url == SEMIANNUAL_REPORTS_AND_TESTIMONIES_URL:
    report_id = os.path.splitext(link_text)[0]
  elif (page_url == AUDIT_REPORTS_URL and
        paragraph.find_previous_sibling("h2").text == "Peer Review Reports"):
    report_id = link_text
  else:
    raise Exception("No report ID found for %r" % link_text)

  # clip report_id if it gets too long
  report_id = report_id[:100]

  if published_on is None:
    admin.log_no_date("exim", report_id, link_text, link_url)
    return

  if link_url.endswith(".pdf"):
    file_type = "pdf"
  elif link_url.endswith(".docx"):
    file_type = "docx"
  elif link_url.endswith((".htm", ".html")):
    file_type = "htm"
  elif link_url.endswith(".cfm"):
    file_type = "htm"
    report['unreleased'] = True
    report['missing'] = True
  elif not os.path.splitext(os.path.basename(link_url))[1]:
    file_type = "htm"
  else:
    raise Exception("Unable to guess file type\n%r" % link_url)

  report['type'] = report_type
  report['published_on'] = datetime.datetime.strftime(published_on, "%Y-%m-%d")
  report['url'] = link_url
  report['report_id'] = report_id
  report['title'] = link_text
  report['file_type'] = file_type
  return report

def report_from(result, page_url, year_range):
  tds = result.find_all("td")
  if len(tds) == 1:
    # Title row, with colspan="3"
    return
  if len(tds) == 0:
    # Degenerate row
    return
  if tds[1]["align"] == "Center":
    # Column headers
    return
  if not result.text.strip():
    # Empty spacer row
    return

  if tds[1].p is not None:
    title = tds[1].p.contents[0]
  else:
    title = tds[1].text
  title = re.sub("\\s+", " ", title).strip()

  links = [a["href"] for a in result.find_all("a")]
  if len(links) > 1:
    links = [link for link in links if not RE_EXTRA_FILES.search(link)]
  if len(links) == 0:
    raise Exception("Couldn't find link for {!r}".format(title))
  if len(links) > 1:
    raise Exception("Found multiple links for {!r}".format(title))
  report_url = urljoin(page_url, links[0])
  report_filename = os.path.basename(report_url)
  report_id, extension = os.path.splitext(report_filename)

  published_on_text = tds[0].text.strip()
  for date_format in DATE_FORMATS:
    try:
      published_on = datetime.datetime.strptime(published_on_text, date_format)
      break
    except ValueError:
      pass
  else:
    admin.log_no_date("fcc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'fcc',
    'inspector_url': 'https://www.fcc.gov/inspector-general',
    'agency': 'fcc',
    'agency_name': "Federal Communications Commission",
    'type': 'audit',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def peer_review_from(result, year_range):
  report_url = urljoin(PEER_REVIEWS_URL, result.get('href'))
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  else:
    admin.log_no_date("archives", report_id, result.text, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  title = "Peer Review {}".format(published_on.year)

  report = {
    'inspector': 'archives',
    'inspector_url': 'https://www.archives.gov/oig/',
    'agency': 'archives',
    'agency_name': 'National Archives and Records Administration',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': 'peer_review',
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def report_from(result, landing_url, topic, year_range, last_published_on):
  try:
    report_link = result.select("a[href]")[0]
  except IndexError as exc:
    # There is a bug for this date where it does not have a report.
    # https://www.sec.gov/about/offices/oig/inspector_general_audits_reports.shtml
    if result.text.strip() == 'Jan. 7, 1997':
      return None, None
    else:
      raise exc

  report_url = urljoin(BASE_REPORT_URL, report_link['href'])

  # HTTPS, even if they haven't updated their links yet
  report_url = re.sub("^http://www.sec.gov", "https://www.sec.gov", report_url)

  logging.debug("### Processing report %s" % report_url)

  report_filename = report_url.split("/")[-1]
  report_id = os.path.splitext(report_filename)[0]
  report_id = report_id.replace("%20", "-")
  title = report_link.text.strip()
  report_type = TOPIC_TO_REPORT_TYPE[topic]

  text_lines = [line.strip() for line in result.text.split("\n")]
  text_lines = [line for line in text_lines if line]
  published_on_text = text_lines[0].split("through")[0].strip().replace(".", "")
  published_on = published_date_for_report(published_on_text, title, report_url,
                                           last_published_on, report_id)
  if not published_on:
    admin.log_no_date("sec", report_id, title, report_url)
    return None, None

  # Skip duplicate report
  if (report_id == '283fin' and published_on.year == 1999 and
      published_on.month == 3 and published_on.day == 16):
    return None, published_on

  # Audit Memo No. 39 is posted in two locations,
  # https://www.sec.gov/about/offices/oig/reports/audits/2005/am39.pdf and
  # https://www.sec.gov/about/oig/audit/am39.pdf,
  # skip the second one
  if report_url == 'https://www.sec.gov/about/oig/audit/am39.pdf':
    return None, published_on

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % landing_url)
    return None, published_on

  logging.debug("### Processing report %s" % report_link)

  report = {
    'report_id': report_id,
    'type': report_type,
    'topic': topic,
    'url': report_url,
    'landing_url': landing_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  add_common_fields(report)
  return report, published_on

def report_from(result, landing_url, report_type, year_range): link = result.find("a") if not link: return title = link.text report_url = urljoin(landing_url, link.get('href')) report_filename = report_url.split("/")[-1] report_id, _ = os.path.splitext(report_filename) published_on = None try: published_on_text = result.select("td")[1].text.strip() published_on = datetime.datetime.strptime(published_on_text, '%m/%d/%y') except (ValueError, IndexError): pass try: published_on_text = result.select("td")[1].text.strip() published_on = datetime.datetime.strptime(published_on_text, '%m/%d/%Y') except (ValueError, IndexError): pass if not published_on: try: published_on_text = title.split("-")[-1].split("–")[-1].strip() published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y') except ValueError: pass if not published_on: if report_id in REPORT_PUBLISHED_MAP: published_on = REPORT_PUBLISHED_MAP[report_id] if not published_on: admin.log_no_date("nea", report_id, title, report_url) return if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return report = { 'inspector': 'nea', 'inspector_url': 'http://arts.gov/oig', 'agency': 'nea', 'agency_name': 'National Endowment for the Arts', 'type': report_type, 'landing_url': landing_url, 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } if report_id in MISSING_IDS: report['unreleased'] = True report['missing'] = True report['url'] = None return report
def report_from(result, year_range): link = result.find("a") report_url = urllib.parse.unquote(link.get('href')) report_filename = report_url.split("/")[-1] report_id, _ = os.path.splitext(report_filename) report_id = urllib.parse.unquote(report_id) title = link.text report_type = None tag_text = None if "Semiannual Report to Congress" in title: report_type = "semiannual_report" else: for tag in result.select(".ul--tags li"): tag_text = tag.text.strip() if tag_text in REPORT_TYPE_MAP: report_type = REPORT_TYPE_MAP[tag_text] break if not report_type: raise Exception("Unrecognized report type %s" % tag_text) published_on = None if report_id in REPORT_PUBLISHED_MAPPING: published_on = REPORT_PUBLISHED_MAPPING[report_id] if not published_on: try: published_on_text = title.split("-")[-1].strip() published_on_text = published_on_text.replace("Sept.", "September") published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y') except ValueError: pass if not published_on: admin.log_no_date("peacecorps", report_id, title, report_url) return if report_id in doubled_reports: if doubled_reports[report_id] == 0: doubled_reports[report_id] += 1 else: return if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return report = { 'inspector': 'peacecorps', 'inspector_url': 'https://www.peacecorps.gov/about/inspectors-general/', 'agency': 'peacecorps', 'agency_name': 'Peace Corps', 'type': report_type, 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } return report
def report_from(result, year_range, report_type=None):
  if result.name == 'a':
    link = result
  else:
    link = result.select("a")[-1]
  href = link['href']
  href = href.replace("file://///cftc.gov/home/dc/MWOODLAND/Desktop/", "")
  report_url = urljoin(REPORTS_URL, href)
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)
  title = link.text

  published_on = None
  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]
  if not published_on:
    try:
      published_on_text = "/".join(re.search(r"(\w+) (\d+), (\d+)", title).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%B/%d/%Y')
    except AttributeError:
      pass
  if not published_on:
    try:
      published_on_text = "/".join(re.search(r"(\w+) (\d+), (\d+)", str(link.next_sibling)).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%B/%d/%Y')
    except AttributeError:
      pass
  if not published_on:
    admin.log_no_date("cftc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  if not report_type:
    report_type = extract_report_type(title)
  if not report_type:
    report_type = extract_report_type(result.find_previous("p").text)
  if not report_type:
    report_type = "other"

  report = {
    'inspector': 'cftc',
    'inspector_url': 'http://www.cftc.gov/About/OfficeoftheInspectorGeneral/index.htm',
    'agency': 'cftc',
    'agency_name': 'Commodity Futures Trading Commission',
    'file_type': 'pdf',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': report_type,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def report_from(result, landing_url, report_type, year_range): report_url = urljoin(landing_url, result.get('href')) report_url = report_url.replace("../", "") report_filename = report_url.split("/")[-1] report_id, _ = os.path.splitext(report_filename) try: title = result.parent.find("em").text except AttributeError: try: title = result.parent.contents[0].text except AttributeError: title = result.parent.contents[0] # There's a typo in the link for this report, it points to the wrong file if report_id == "Report14-28-TN-17163" and title.find("Report on the Better Basics, Inc., Literacy Program for Clay, Jefferson") != -1: report_url = "http://www.arc.gov/images/aboutarc/members/IG/Report14-34-AL-17208-302-12.pdf" report_id = "Report14-34-AL-17208-302-12" published_on = None if report_id in REPORT_PUBLISHED_MAP: published_on = REPORT_PUBLISHED_MAP[report_id] if not published_on: try: published_on_text = title.split("\u2013")[-1].strip() published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y') except ValueError: pass if not published_on: try: response = utils.scraper.request(method="HEAD", url=report_url) last_modified = response.headers["Last-Modified"] published_on = datetime.datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z") except ValueError: pass if not published_on: admin.log_no_date("arc", report_id, title, report_url) return if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return report = { 'inspector': 'arc', 'inspector_url': 'http://www.arc.gov/oig', 'agency': 'arc', 'agency_name': 'Appalachian Regional Commission', 'report_id': report_id, 'url': report_url, 'title': title, 'type': report_type, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } return report
def report_from(result, landing_url, report_type, year_range):
  link = result.find('a')
  report_url = urljoin(landing_url, link['href'])
  # NB: rstrip('.pdf') strips any trailing '.', 'p', 'd', or 'f' characters,
  # not the literal ".pdf" suffix; it happens to work for these filenames.
  report_id = os.path.basename(urlparse(report_url)[2]).rstrip('.pdf')
  title = re.sub("\\s+", " ", link.text).strip()
  if 'semiannual' in report_id:
    title = "Semi-Annual Report: %s" % title

  if title == "Report in Brief" or title.endswith("Determination Letter"):
    # Skip report in brief or determination letter after a full report
    return

  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  if not published_on:
    issued_strong = result.parent.parent.parent.find("strong", text="Issued")
    if issued_strong:
      issued_on = ISSUED_DATE_EXTRACTION.search(issued_strong.parent.text)
      if issued_on:
        date_fmt = "%B %d, %Y"
        published_on = datetime.datetime.strptime(issued_on.group(0), date_fmt)
  if not published_on:
    published_on = extract_date_from_report_id(report_id)
  if not published_on:
    admin.log_no_date("cpb", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'cpb',
    'inspector_url': 'http://www.cpb.org/oig/',
    'agency': 'cpb',
    'agency_name': 'Corporation for Public Broadcasting',
    'file_type': 'pdf',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    'unreleased': False,
  }
  return report

def report_from(result, report_type, base_url, year_range):
  link = result.find("a")
  if not link and result.text.strip() == ARCHIVE_PREAMBLE_TEXT:
    return

  report_url = urllib.parse.urljoin(base_url, link.get('href'))
  report_id, title = link.text.split(maxsplit=1)
  report_id = report_id.rstrip(":").rstrip(",")

  if report_url == AUDIT_REPORTS_ARCHIVE_URL:
    return

  if report_id == "OIG-F-21-17-01" and "Management Letter" in title:
    report_id += "-Management-Letter"

  title = title.strip()

  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  else:
    for paren_text in re.findall(r'\((.*?)\)', title):
      try:
        published_on = datetime.datetime.strptime(paren_text, '%B %d, %Y')
        break
      except ValueError:
        pass
      try:
        published_on = datetime.datetime.strptime(paren_text, '%B %Y')
        break
      except ValueError:
        pass
  if not published_on:
    admin.log_no_date("nlrb", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'nlrb',
    'inspector_url': "https://www.nlrb.gov/who-we-are/inspector-general",
    'agency': 'nlrb',
    'agency_name': "National Labor Relations Board",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def quarterly_report_from(result, year_range):
  report_url = result['href']
  report_filename = report_url.split("/")[-1]
  report_id, extension = os.path.splitext(report_filename)

  groupheader = result.parent.parent.parent.parent.find("div", class_="groupheader")
  year = int(groupheader.text.strip())
  if year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  title = "Quarterly Report to Congress, {}, {}".format(year, result.text.strip())

  if report_id in QUARTERLY_REPORT_DATES:
    published_on = QUARTERLY_REPORT_DATES[report_id]
  else:
    published_on = None
  if published_on is None:
    try:
      published_on = datetime.datetime.strptime(report_id, "%B_%d_%Y_Report_to_Congress")
    except ValueError:
      pass
  if published_on is None:
    try:
      published_on = datetime.datetime.strptime(report_id, "%B_%d_%Y_Report_To_Congress")
    except ValueError:
      pass
  if published_on is None:
    try:
      published_on = datetime.datetime.strptime(report_id, "%B_%d_%Y_Quarterly_Report_to_Congress")
    except ValueError:
      pass
  if published_on is None:
    admin.log_no_date("sigtarp", report_id, title, report_url)
    return

  report = {
    'inspector': 'sigtarp',
    'inspector_url': "https://www.sigtarp.gov",
    'agency': 'sigtarp',
    'agency_name': "Special Inspector General for the Troubled Asset Relief Program",
    'type': 'quarterly',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def report_from(result, year_range): link = result.find("a") report_url = urljoin(REPORTS_URL, link.get('href')) report_url = urlunparse(list(urlparse(report_url)[:4]) + ["", ""]) if report_url in BLACKLIST_REPORT_URLS: return # Follow redirects to get real file names if report_url.startswith("https://www.cpsc.gov/Media/"): report_url = utils.resolve_redirect(report_url) # URLs with /PageFiles in them need to use the filename and its # directory to be unique. Other URLs can just use the filename. if "PageFiles" in report_url: # e.g. /../132643/fy11fisma.pdf -> 132643-fy11fisma.pdf report_filename = str.join("-", report_url.split("/")[-2:]) else: report_filename = report_url.split("/")[-1] report_id, _ = os.path.splitext(report_filename) title = link.text if report_id in REPORT_PUBLISHED_MAP: published_on = REPORT_PUBLISHED_MAP[report_id] else: date_spans = result.select(".date-display-single") if date_spans: published_on_text = date_spans[0].text published_on = datetime.datetime.strptime(published_on_text, '%A, %B %d, %Y') else: admin.log_no_date("cpsc", report_id, title, report_url) return if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return report_type = report_type_from_title(title) report = { 'inspector': 'cpsc', 'inspector_url': 'https://www.cpsc.gov/About-CPSC/Inspector-General/', 'agency': 'cpsc', 'agency_name': 'Consumer Product Safety Commission', 'type': report_type, 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } return report
def report_from(result, landing_url, report_type, year_range): link = result.find("a") report_url = urljoin(landing_url, link.get('href')) title = link.text if report_url in REPORT_URL_MAPPING: report_url = REPORT_URL_MAPPING[report_url] if report_url in BLACKLIST_REPORT_URLS: return try: report_id = result.select("td")[0].text except IndexError: try: report_id = result.select("li")[0].text except IndexError: report_filename = report_url.split("/")[-1] report_id, _ = os.path.splitext(report_filename) published_on = None if report_id in REPORT_PUBLISHED_MAP: published_on = REPORT_PUBLISHED_MAP[report_id] if not published_on: try: published_on_text = title.split("-")[-1].strip() published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y') except ValueError: pass if not published_on: admin.log_no_date("fmc", report_id, title, report_url) return if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return report = { 'inspector': 'fmc', 'inspector_url': 'http://www.fmc.gov/bureaus_offices/office_of_inspector_general.aspx', 'agency': 'fmc', 'agency_name': 'Federal Maritime Commission', 'type': report_type, 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } return report
def report_from(result, landing_url, report_type, year_range):
  report_url = urljoin(landing_url, result.get('href'))
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)
  report_id = unquote(report_id)
  report_id = "-".join(report_id.split())
  report_id = report_id.replace("_", "-")
  title = clean_text(result.text)
  if not title:
    return

  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  if not published_on:
    try:
      published_on_text = "-".join(re.findall(r'(\w+) (\d+), (\d{4})', title)[-1])
      published_on = datetime.datetime.strptime(published_on_text, '%B-%d-%Y')
    except IndexError:
      pass
  if not published_on:
    try:
      published_on_text = "-".join(re.search(r'(\d+) (\w+) (\d{4})', title).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%d-%B-%Y')
    except (AttributeError, ValueError):
      pass
  if not published_on:
    admin.log_no_date("eac", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'eac',
    'inspector_url': 'http://www.eac.gov/inspector_general/',
    'agency': 'eac',
    'agency_name': 'Election Assistance Commission',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def semiannual_report_from(result, year_range):
  report_url = result.select("a")[0].get('href')
  report_filename = report_url.split("/")[-1]
  report_id = os.path.splitext(report_filename)[0]
  summary = result.select("p")[0].text
  title = result.select("h2 > a")[0].text

  published_on = None
  try:
    published_on = datetime.datetime.strptime(title.split("-")[-1].strip(),
                                              '%B %d, %Y')
  except ValueError:
    pass
  if published_on is None:
    try:
      published_on = datetime.datetime.strptime(title.split(" to ")[-1].strip(),
                                                '%B %d, %Y')
    except ValueError:
      pass
  if published_on is None:
    try:
      published_on = datetime.datetime.strptime(title.split("\u2013")[-1].strip(),
                                                '%B %d, %Y')
    except ValueError:
      pass
  if published_on is None:
    admin.log_no_date("va", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_id)
    return

  report = {
    'inspector': 'va',
    'inspector_url': 'https://www.va.gov/oig',
    'agency': 'VA',
    'agency_name': "Department of Veterans Affairs",
    'type': 'semiannual_report',
    'report_id': report_id,
    'url': report_url,
    'topic': "Semiannual Report",
    'summary': summary,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def report_from(result, landing_url, report_type, year_range): report_url = urljoin(landing_url, result.get("href")) report_filename = report_url.split("/")[-1] report_id, _ = os.path.splitext(report_filename) report_id = unquote(report_id) report_id = "-".join(report_id.split()) title = clean_text(result.text) published_on = None if report_id in REPORT_PUBLISHED_MAP: published_on = REPORT_PUBLISHED_MAP[report_id] if not published_on: try: published_on_text = "-".join(re.findall("(\w+) (\d+), (\d{4})", title)[-1]) published_on = datetime.datetime.strptime(published_on_text, "%B-%d-%Y") except IndexError: pass if not published_on: try: published_on_text = "-".join(re.search("(\d+) (\w+) (\d{4})", title).groups()) published_on = datetime.datetime.strptime(published_on_text, "%d-%B-%Y") except (AttributeError, ValueError): pass if not published_on: admin.log_no_date("eac", report_id, title, report_url) return if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return report = { "inspector": "eac", "inspector_url": "http://www.eac.gov/inspector_general/", "agency": "eac", "agency_name": "Election Assistance Commission", "type": report_type, "report_id": report_id, "url": report_url, "title": title, "published_on": datetime.datetime.strftime(published_on, "%Y-%m-%d"), } return report
def other_report_from(result, year_range):
  link = result.find("a")
  basename = os.path.splitext(os.path.basename(link["href"]))[0]
  report_id = clean_text(basename).replace("'", "").replace(":", "")
  report_id = re.sub("-+", "-", report_id)
  report_url = urljoin(OTHER_REPORTS_URL, link["href"])
  match = OTHER_REPORT_RE.match(clean_text(link.text))
  title = match.group(1)
  published_on_text = match.group(2)

  published_on = None
  try:
    published_on = datetime.datetime.strptime(published_on_text, "%B %d, %Y")
  except ValueError:
    pass
  if not published_on:
    try:
      published_on = datetime.datetime.strptime(published_on_text, "%b. %d, %Y")
    except ValueError:
      pass
  if not published_on:
    admin.log_no_date("ncua", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': "ncua",
    'inspector_url': HOMEPAGE_URL,
    'agency': "ncua",
    'agency_name': "National Credit Union Administration",
    'type': "other",
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def report_from(result, year_range, report_type): path = result.get("href") html_report_url = urljoin(INSPECTOR_URL, path) html_report = utils.beautifulsoup_from_url(html_report_url) report_id = path.split('/')[-1] title = html_report.find("span", {"property": "dc:title"})['content'] fiscal_year = fiscal_year_parse(html_report) links = html_report.select(".file a") hrefs = filter_links(links) if len(hrefs) > 1: raise Exception("Found multiple links on {}:\n{}".format(html_report_url, hrefs)) if len(hrefs) == 0: raise Exception("Found no links on {}".format(html_report_url)) pdf_report_url = hrefs[0] if report_id in REPORT_PUBLISHED_MAP: published_on = REPORT_PUBLISHED_MAP[report_id] else: admin.log_no_date("eeoc", report_id, title, pdf_report_url) return if fiscal_year not in year_range: logging.debug("[%s] Skipping, not in requested range." % pdf_report_url) return report = { 'inspector': "eeoc", 'inspector_url': INSPECTOR_URL, 'agency': "eeoc", 'agency_name': "Equal Employment Opportunity Commission", 'report_id': report_id, 'url': pdf_report_url, 'title': title, 'type': report_type, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } return report
def report_from(result, year_range):
  # walk backwards through the doc to find the header title
  for element in result.previous_elements:
    if (element and
        isinstance(element, Tag) and
        element.name == "span" and
        element.has_attr("class") and
        "collapseomatic" in element["class"]):
      header = element.text.strip().lower()
      break
  else:
    raise Exception("Couldn't find the header for %s" % result)

  if header.startswith("inspection"):
    category = "inspection"
  elif header.startswith("semiannual"):
    category = "semiannual_report"
  else:
    category = "other"

  report_id = os.path.splitext(os.path.basename(result['href']))[0]
  report_url = urljoin(REPORTS_URL, result['href'].strip())
  title = inspector.sanitize(result.text)

  # Each financial/performance report is linked twice, once for the IG's
  # transmittal letter and independent auditor's report, and once for
  # the IG's "Perspective on Management and Performance Challenges."
  # Skip the first one and save the second
  if ("IG's Transmittal Letter and Independent Auditor's Report" in title
      and "(pages" in title):
    return None
  elif title == "Hotline Poster":
    return None

  published_on = REPORT_PUBLISHED_MAPPING.get(title)
  if not published_on:
    published_on = REPORT_PUBLISHED_MAPPING.get(report_id)
  if not published_on:
    date_match = DATE_RE.match(title)
    if date_match:
      published_on = datetime.datetime.strptime(date_match.group(1), "%Y.%m")
      if date_match.lastindex == 2:
        title = date_match.group(2)
      elif header.startswith("semiannual"):
        title = published_on.strftime("Semiannual Report to Congress, %B %Y")
      else:
        raise Exception("No good title for %s" % report_id)
  if not published_on:
    admin.log_no_date("denali", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': "denali",
    'inspector_url': "http://www.oig.denali.gov",
    'agency': "denali",
    'agency_name': "Denali Commission",
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': category,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def report_from(result, year_range, report_type, title_prefix=None):
  report_url = urljoin(REPORTS_URL, result.select("a")[-1].get("href"))

  # Temporary hacks to account for link mistakes
  if report_url == "http://www.fec.gov/fecig/documents/Semi14a_000.pdf":
    report_url = "http://www.fec.gov/fecig/documents/Semi14a.pdf"
  if report_url == ("http://www.fec.gov/fecig/documents/"
                    "ReviewofOutstandingRecommendationsasofJune2014_001.pdf"):
    report_url = ("http://www.fec.gov/general/documents/"
                  "ReviewofOutstandingRecommendationsasofJune2014.pdf")

  report_filename = report_url.split("/")[-1]
  report_id, extension = os.path.splitext(report_filename)

  published_on = None
  if report_url.endswith(".pdf"):
    # Inline report
    title = inspector.sanitize(result.contents[0].strip().rstrip("-"))
    title = re.sub("\\s+", " ", title)
    if title.endswith((" 200", " 201")):
      # some years are split up by a <span> tag
      title = title + result.contents[1].text
  else:
    # Some pages have separate landing pages.
    doc = utils.beautifulsoup_from_url(report_url)
    title = doc.select("h3")[1].text.strip()
    try:
      published_on_text = doc.select("h3")[2].text.strip()
    except IndexError:
      published_on_text = doc.select("h3")[1].text.strip()
    published_on_text = published_on_text.replace("Period ending ", "")
    published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if (title == "November 2016" and
      report_url == "http://www.fec.gov/fecig/documents/OIGSemiannualReporttoCongress-May2016-FinalPublicDistribution.pdf"):
    # Fix copy-paste error
    report_url = "http://www.fec.gov/fecig/documents/OIGFall2016SARFINAL.pdf"
    report_filename = report_url.split("/")[-1]
    report_id, extension = os.path.splitext(report_filename)

  if not published_on:
    if report_id in REPORT_PUBLISHED_MAPPING:
      published_on = REPORT_PUBLISHED_MAPPING[report_id]
  if not published_on:
    try:
      published_on_text = "-".join(re.search(r'(\w+)\s+(\d{4})', title).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%B-%Y')
    except (ValueError, AttributeError):
      pass

  if title_prefix:
    title = "{}{}".format(title_prefix, title)

  if not published_on:
    admin.log_no_date("fec", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': "fec",
    'inspector_url': "http://www.fec.gov/fecig/fecig.shtml",
    'agency': "fec",
    'agency_name': "Federal Election Commission",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),  # Date of publication
  }
  return report

def report_from(result, landing_url, report_type, year_range): title = result.select("td")[-1].text title = re.sub("\\s+", " ", title) report_id_match = REPORT_ID_RE.match(result.td.text.strip()) if ("contains sensitive information" in title or "This correspondence will not be posted" in title or title in UNPUBLISHED_REPORT_TITLES): unreleased = True report_url = None if report_id_match: report_id = report_id_match.group(0) else: report_id = inspector.slugify("-".join( title.strip().split())[:100]) else: unreleased = False link = result.find("a") report_id = inspector.slugify(link.text.strip()) if link.get('href') == "#": unreleased = True report_url = None else: report_url = urljoin(landing_url, link.get('href')) if landing_url == SEMIANNUAL_REPORTS_URL: if title.find("Transmittal Letter") != -1: report_id = report_id + "-transmittal" published_on = None try: published_on = datetime.datetime.strptime(link.text.strip(), '%m.%d.%y') except (ValueError, UnboundLocalError): pass if not published_on: if report_url: date_match = DATE_RE.search(report_url) if date_match: date_text = date_match.group(1) published_on = datetime.datetime.strptime( date_text, "%m-%d-%y") if not published_on: if report_id in REPORT_PUBLISHED_MAP: published_on = REPORT_PUBLISHED_MAP[report_id] if not published_on: admin.log_no_date("gpo", report_id, title, report_url) return if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return report = { 'inspector': 'gpo', 'inspector_url': 'http://www.gpo.gov/oig/', 'agency': 'gpo', 'agency_name': 'Government Publishing Office', 'file_type': 'pdf', 'type': report_type, 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } if unreleased: report['unreleased'] = unreleased report['landing_url'] = landing_url return report
def report_from(result, year_range): title = result.find("em").text.strip() landing_url = REPORTS_URL hrefs = [a.get("href").strip() for a in result.find_all("a")] hrefs = [href for href in hrefs if href] if hrefs: unreleased = False report_url = urljoin(REPORTS_URL, hrefs[-1]) else: unreleased = True report_url = None if report_url == "https://www.fdicig.gov/semi-reports/sar2003mar/" \ "oigsemi-03-09.pdf": # This URL is a typo, results in 404 report_url = "https://www.fdicig.gov/semi-reports/Semi2003OCT/sarOCT03.shtml" if report_url == "https://www.fdicig.gov/semi-reports/sar2009mar/" \ "oigsemi-03-09.pdf" and \ title == "FDIC Office of Inspector General's Semiannual Report to " \ "the Congress 4/1/2009 - 9/30/2009": # This URL points to the wrong report report_url = "https://www.fdicig.gov/semi-reports/SAROCT09/" \ "OIGSemi_FDIC_09-9-09.pdf" if report_url == "https://www.fdicig.gov/press/pr-08-24-12.shtml" and \ title == "Bank President Imprisoned for Embezzlement": # The title and URL don't match, and both were copied from other reports, # so we skip this entry return None report_type_text = result.select("td")[0].text if report_type_text in RECORD_TYPE_BLACKLIST: return report_type = type_for_report(report_type_text) if report_url and report_url != GENERIC_MISSING_REPORT_URL: report_filename = report_url.split("/")[-1] report_id, extension = os.path.splitext(report_filename) if report_url.find("/evaluations/") != -1: if not report_url.endswith("e"): report_id = report_id + "e" else: report_id = "-".join(title.split())[:50] report_id = report_id.replace(":", "") if report_id in REPORT_PUBLISHED_MAPPING: published_on = REPORT_PUBLISHED_MAPPING[report_id] else: published_on_text = result.select("td")[2].text try: published_on = datetime.datetime.strptime(published_on_text, '%m/%d/%Y') except ValueError: print(result) if report_url: admin.log_no_date("fdic", report_id, title, report_url) else: admin.log_no_date("fdic", report_id, title) return if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return missing = False if report_url == GENERIC_MISSING_REPORT_URL: missing = True unreleased = True report_url = None report = { 'inspector': "fdic", 'inspector_url': "https://www.fdicig.gov", 'agency': "fdic", 'agency_name': "Federal Deposit Insurance Corporation", 'type': report_type, 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } if unreleased: report['unreleased'] = unreleased report['landing_url'] = landing_url if missing: report['missing'] = missing return report
def rss_report_from(result, year_range):
  report_url = result.find("link").next_sibling.strip()
  if report_url.rstrip("/") == 'http://www.si.edu/oig':
    # This is the default url the IG uses for announcements of things like
    # a website redesign or changes to the RSS feed.
    return

  if report_url == "http://www.si.edu/oig/OIGStratPlan.pdf":
    # This strategic plan is no longer on the website, but it is reproduced in
    # multiple semiannual reports, so we skip it here.
    return

  if report_url in RSS_BROKEN_LINKS:
    report_url = RSS_BROKEN_LINKS[report_url]
  else:
    report_url = report_url.replace("/OIG/SAR/Semiannual_Reports/", "/OIG/SAR/")
    report_url = report_url.replace("/oig/Semiannual_Reports/", "/Content/OIG/SAR/")
    report_url = report_url.replace("/oig/AuditReports/", "/Content/OIG/Audits/")
    report_url = report_url.replace("/oig/ARRA_Reports/", "/Content/OIG/Audits/")

  file_type = None
  if not report_url.endswith(".pdf"):
    file_type = "html"

  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  if report_id in report_ids_seen:
    return
  report_ids_seen.add(report_id)

  title = result.find("title").text
  report_type = report_type_from_url(report_url)

  published_on = None
  published_on_text = result.find("pubdate").text
  try:
    published_on = datetime.datetime.strptime(published_on_text,
                                              '%a, %d %b %Y %H:%M:%S %z').date()
  except ValueError:
    pass
  if not published_on:
    try:
      published_on = datetime.datetime.strptime(published_on_text,
                                                '%a, %d %B %Y %H:%M:%S %z').date()
    except ValueError:
      pass
  if not published_on:
    admin.log_no_date("smithsonian", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'smithsonian',
    'inspector_url': 'https://www.si.edu/OIG',
    'agency': 'smithsonian',
    'agency_name': 'Smithsonian Institution',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if file_type:
    report['file_type'] = file_type
  return report

def report_from(result, year_range, topic, subtopic_url, subtopic=None):
  # Ignore links to other subsections
  if result.get('class') and result['class'][0] == 'crossref':
    return

  if result.name == 'a':
    # Sometimes we already have a link
    result_link = result
  else:
    result_link = result.find("a")

  # No link found, this is probably just an extra <li> on the page.
  if result_link is None:
    return

  # If this is just an anchor link on the same page, skip
  if not strip_url_fragment(result_link['href']):
    return

  title = result_link.text
  title = title.replace("\xe2\x80\x93", "-")
  title = inspector.sanitize(title)
  title = re.sub(r'\s+', ' ', title)
  if title in TITLE_NORMALIZATION:
    title = TITLE_NORMALIZATION[title]
  if title in BLACKLIST_TITLES:
    return

  report_url = urljoin(subtopic_url, result_link['href']).strip()
  if report_url in REPORT_URL_MAPPING:
    report_url = REPORT_URL_MAPPING[report_url]

  # Fix copy-paste error in link
  if (title == "Medicare Compliance Review of Altru Hospital for 2012 and 2013" and
      report_url == "http://oig.hhs.gov/oas/reports/region4/41408036.asp"):
    report_url = "http://oig.hhs.gov/oas/reports/region7/71505070.asp"

  # Ignore reports from other sites
  if BASE_URL not in report_url:
    return

  if report_url in BLACKLIST_REPORT_URLS:
    return

  if report_url in OEI_COMBINED_LANDING_PAGES:
    report_url = OEI_COMBINED_LANDING_PAGES[report_url][title]

  report_filename = report_url.split("/")[-1]
  report_id, extension = os.path.splitext(report_filename)

  if report_filename == "11302505.pdf":
    report_id = report_id + "_early_alert"

  # Try a quick check from the listing page to see if we can bail out based on
  # the year
  try:
    published_on_text = result.find_previous("dt").text.strip()
    published_on = datetime.datetime.strptime(published_on_text, "%m-%d-%Y")
  except (AttributeError, ValueError):
    published_on = None

  if published_on and published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  # This report is listed twice, once with the wrong date
  if (published_on and published_on.year == 2012 and published_on.month == 1 and
      published_on.day == 12 and report_id == "20901002"):
    return

  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]
  else:
    # Process reports with landing pages
    if extension.lower() != '.pdf':
      report_url, published_on = report_from_landing_url(report_url)
    else:
      published_on = published_on_from_inline_link(
        result,
        report_filename,
        title,
        report_id,
        report_url,
      )

  if not published_on:
    admin.log_no_date("hhs", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  result = {
    'inspector': 'hhs',
    'inspector_url': 'http://oig.hhs.gov',
    'agency': 'hhs',
    'agency_name': 'Health & Human Services',
    'report_id': report_id,
    'topic': topic.strip(),
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if subtopic:
    result['subtopic'] = subtopic
  return result

def report_from(result, landing_url, report_type, year_range): link = result.find("a") report_url = urljoin(landing_url, link.get('href').strip()) report_filename = report_url.split("/")[-1] report_id, _ = os.path.splitext(report_filename) title = link.text file_type = None unreleased = False if "Non Public Report" in title.replace("-", " "): # Normalize title for easier detection unreleased = True landing_url = report_url report_url = None elif not report_url.endswith(".pdf"): # A link to an html report file_type = "html" estimated_date = False published_on = None if report_id in REPORT_PUBLISHED_MAP: published_on = REPORT_PUBLISHED_MAP[report_id] if not published_on: if not os.path.splitext(report_filename)[1]: report_doc = utils.beautifulsoup_from_url(report_url) if report_doc: time_tag = report_doc.time if time_tag: date = report_doc.time["datetime"] published_on = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S") if not published_on: if landing_url == SEMIANNUAL_REPORTS_URL: fy_match = re.match("Fiscal Year ([0-9]{4})", title) if fy_match: year = int(fy_match.group(1)) if "(First Half)" in title: published_on = datetime.datetime(year, 3, 31) estimated_date = True elif "(Second Half)" in title: published_on = datetime.datetime(year, 9, 30) estimated_date = True if not published_on: admin.log_no_date("ftc", report_id, title, report_url) return if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return report = { 'inspector': 'ftc', 'inspector_url': "https://www.ftc.gov/about-ftc/office-inspector-general", 'agency': 'ftc', 'agency_name': "Federal Trade Commission", 'type': report_type, 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } if estimated_date: report['estimated_date'] = estimated_date if unreleased: report['unreleased'] = unreleased report['landing_url'] = landing_url if file_type: report['file_type'] = file_type return report
def report_from(result, landing_url, report_type, year_range): title = result.select("td")[-1].text title = re.sub("\\s+", " ", title) report_id_match = REPORT_ID_RE.match(result.td.text.strip()) if ("contains sensitive information" in title or "This correspondence will not be posted" in title or title == "Unscheduled and Unpaid Absenteeism in the Office of " "Plant Operations"): unreleased = True report_url = None if report_id_match: report_id = report_id_match.group(0) else: report_id = inspector.slugify("-".join(title.strip().split())[:100]) else: unreleased = False link = result.find("a") report_id = inspector.slugify(link.text.strip()) if link.get('href') == "#": unreleased = True report_url = None else: report_url = urljoin(landing_url, link.get('href')) if landing_url == SEMIANNUAL_REPORTS_URL: if title.find("Transmittal Letter") != -1: report_id = report_id + "-transmittal" published_on = None try: published_on = datetime.datetime.strptime(link.text.strip(), '%m.%d.%y') except (ValueError, UnboundLocalError): pass if not published_on: if report_url: date_match = DATE_RE.search(report_url) if date_match: date_text = date_match.group(1) published_on = datetime.datetime.strptime(date_text, "%m-%d-%y") if not published_on: if report_id in REPORT_PUBLISHED_MAP: published_on = REPORT_PUBLISHED_MAP[report_id] if not published_on: admin.log_no_date("gpo", report_id, title, report_url) return if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return report = { 'inspector': 'gpo', 'inspector_url': 'http://www.gpo.gov/oig/', 'agency': 'gpo', 'agency_name': 'Government Publishing Office', 'file_type': 'pdf', 'type': report_type, 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } if unreleased: report['unreleased'] = unreleased report['landing_url'] = landing_url return report
def report_from(result, landing_url, report_type, year_range): link = result.find("a") if link: title = link.text report_url = link.get('href') unreleased = False else: title = result.select("div.views-field-title")[0].text report_url = None unreleased = True published_on = None try: published_on_text = result.select("span.date-display-single")[0].text published_on = datetime.datetime.strptime(published_on_text, '%m/%d/%Y') except IndexError: pass if not published_on: try: title_text = result.select("div.views-field-title span")[0].text.strip() date_match = DATE_RE.match(title_text) published_on_text = date_match.group(0) published_on = datetime.datetime.strptime(published_on_text, "%B %d, %Y") title = title_text[date_match.end():] except (IndexError, AttributeError): pass if not published_on: admin.log_no_date("usaid", report_url, title, report_url) return if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % report_url) return try: report_id_text = result.select("div.views-field-field-auditreport-doc-1")[0].text.strip() report_id = "-".join(report_id_text.replace("/", "-").replace(":", "").split()) except IndexError: report_id = None if not report_id and report_url: report_filename = report_url.split("/")[-1] report_id, _ = os.path.splitext(report_filename) if not report_id: report_id = "{}-{}".format("-".join(title.split()), published_on_text) report_id = report_id.replace("/", "-") if title.startswith("Follow-Up"): report_id = report_id + "-follow-up" if report_url == "https://oig.usaid.gov/sites/default/files/audit-reports/" \ "0-000-12-001-s_0.pdf": # Two versions of this report have been uploaded report_id = report_id + "_final" if report_url == "https://oig.usaid.gov/sites/default/files/audit-reports/" \ "1-520-01-010-p_0.pdf": # This file has been uploaded twice, once with "_0" and once without return None if report_url in MISMATCHED_REPORT_URLS: # The report number and PDF file for these reports are copies of unrelated # reports report_id = "-".join(re.split("[^a-z]+", title.lower())) report_url = None unreleased = True report = { 'inspector': "usaid", 'inspector_url': "https://oig.usaid.gov", 'agency': "usaid", 'agency_name': "Agency For International Development", 'type': report_type, 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } if unreleased: report['unreleased'] = unreleased report['landing_url'] = landing_url del report['url'] return report
def report_from(result, year_range):
  report_url = urljoin(RECENT_AUDITS_URL, result.get('href'))
  if report_url in URL_BLACKLIST:
    return None
  # Strip extra path adjustments
  report_url = report_url.replace("../", "")

  summary = None
  if not report_url.endswith(".pdf"):
    # Some reports link to another page, which links to the full report
    report_page = utils.beautifulsoup_from_url(report_url)
    relative_report_url = report_page.select("div.block a[href]")[0]['href']
    report_url = urljoin(report_url, relative_report_url)
    # Strip extra path adjustments
    report_url = report_url.replace("../", "")
    summary = "\n".join(paragraph.text
                        for paragraph in report_page.select("div.grid_12 p"))

  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  if report_id in report_ids_seen:
    return
  report_ids_seen.add(report_id)

  title = result.text.strip()
  report_type = report_type_from_url(report_url)

  if not title:
    return None

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    try:
      published_on_text = "/".join(re.search(r'(\w+) (\d+), (\d+)', title).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%B/%d/%Y')
    except AttributeError:
      pass

  if not published_on:
    month_year_match = MONTH_YEAR_RE.search(result.text)
    if month_year_match:
      date_text = ' '.join(month_year_match.group(0).split())
      published_on = datetime.datetime.strptime(date_text, '%B %Y')
      estimated_date = True

  if not published_on:
    admin.log_no_date("smithsonian", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'smithsonian',
    'inspector_url': 'https://www.si.edu/OIG',
    'agency': 'smithsonian',
    'agency_name': 'Smithsonian Institution',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if summary:
    report['summary'] = summary
  if estimated_date:
    report['estimated_date'] = estimated_date
  return report

def report_from(result, landing_url, year_range):
  report_url = urljoin(landing_url, result.get('href'))
  # HTTPS, even if they haven't updated their links yet
  report_url = re.sub("^http://www.fca.gov", "https://www.fca.gov", report_url)

  if landing_url + '#' in report_url:
    # These are just anchor links, skip them.
    return

  if result.find_parent("ul") and result.find_parent("ul").get('type') == 'disc':
    # These are just anchor links, skip them.
    return

  title = clean_text(result.text)
  if title == 'Inspector General Reports':
    # Just a return link to the main IG page
    return

  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  published_on = None
  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]

  if not published_on:
    try:
      li = result.parent
      if li.name == "u":
        li = li.parent
      published_on_text = li.contents[1].lstrip(",").split("(")[0].strip()
    except (IndexError, TypeError):
      published_on_text = result.text.strip()
    published_on_text = clean_text(published_on_text)
    try:
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except ValueError:
      pass

  if not published_on:
    try:
      published_on_text = li.contents[1].strip().lstrip("(").rstrip(")")
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except (IndexError, TypeError, ValueError):
      pass

  if not published_on:
    try:
      published_on_text = "/".join(re.search(r"(\w{3}).* (\d{4})", published_on_text).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%b/%Y')
    except AttributeError:
      pass

  if not published_on:
    admin.log_no_date("fca", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report_type_text = result.find_previous("p", class_="mainContentheader2").text.strip()
  report_type = type_for_report(report_type_text)

  report = {
    'inspector': 'fca',
    'inspector_url': 'https://www.fca.gov/home/inspector.html',
    'agency': 'fca',
    'agency_name': 'Farm Credit Administration',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def report_from(result, landing_url, report_type, year_range):
  td = result.select("td")[1]
  link = td.a
  if link:
    title = re.sub(r"\s+", " ", link.text.strip())
    unreleased = False
    report_url = urljoin(landing_url, link.get('href'))
    report_filename = report_url.split("/")[-1]
    report_filename = IE_DOWNLOAD_SUFFIX_RE.sub("", report_filename)
    report_id, _ = os.path.splitext(report_filename)
  else:
    title = re.sub(r"\s+", " ", td.text.strip())
    title = title.replace(" (Unavailable)", "")
    unreleased = True
    report_url = None

  published_on = None
  published_on_match = DATE_RE.search(td.text)
  if published_on_match:
    published_on_text = published_on_match.group(1)
    published_on = datetime.datetime.strptime(published_on_text, "%m/%d/%Y")

  if published_on is None and link is not None:
    sar_match = SAR_RE.search(link.text)
    if sar_match:
      published_on = datetime.datetime.strptime(sar_match.group(1), "%B %Y")
    else:
      if report_id in REPORT_ID_PUBLISHED_MAP:
        published_on = REPORT_ID_PUBLISHED_MAP[report_id]

  if link is None and published_on is None:
    if title in REPORT_TITLE_PUBLISHED_MAP:
      published_on = REPORT_TITLE_PUBLISHED_MAP[title]
    else:
      admin.log_no_date("rrb", "?", title)
      return

  if link is None:
    report_id = "{}-{}".format(published_on.strftime("%m-%d-%y"),
                               "-".join(title.split()))[:50]

  if published_on is None:
    admin.log_no_date("rrb", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % title)
    return

  report = {
    'inspector': 'rrb',
    'inspector_url': "http://www.rrb.gov/oig/",
    'agency': 'rrb',
    'agency_name': "Railroad Retirement Board",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  return report

def report_from(result, landing_url, report_type, year_range):
  if not result.text or result.text in BLACKLIST_REPORT_TITLES:
    # There are a few empty links due to bad html, and some links for
    # alternative formats (PDF) that we will just ignore.
    return

  link_text = None
  if result.name == 'a':
    report_url = result.get('href')
    link_text = inspector.sanitize(result.text)
    title = inspector.sanitize("%s %s" % (result.text, result.next_sibling))
  else:
    links = [link for link in result.find_all('a') if link.text.strip()]
    report_url = links[0].get('href')
    link_text = inspector.sanitize(result.a.text)
    title = inspector.sanitize(result.text)
  report_url = urljoin(landing_url, report_url)
  report_filename = os.path.basename(report_url)

  if title.endswith("PDF"):
    title = title[:-3]
  title = title.rstrip(" .")

  prev = result.previous_sibling
  if isinstance(prev, NavigableString) and "See, also:" in prev:
    return None

  report_no_match = REPORT_NO_RE.match(link_text)
  if report_no_match:
    report_id = report_no_match.group(0)
    if "fraud" in report_url.lower():
      report_id = "fraud-alert-" + report_id
    elif "Client_Trust_Fund" in report_url:
      report_id = "CTF-" + report_id
    elif report_filename.startswith("sr"):
      report_id = "special-report-" + report_id
  else:
    report_id, _ = os.path.splitext(report_filename)
    report_id = unquote(report_id)
  report_id = "-".join(report_id.split())
  report_id = report_id.replace("\\", "")  # strip backslashes

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  elif link_text == "June 2015":
    published_on = datetime.datetime(2015, 6, 1)
  else:
    published_on_text = None
    try:
      published_on_text = re.search(r'(\d+/\d+/\d+)', title).groups()[0]
    except AttributeError:
      pass
    if not published_on_text:
      try:
        published_on_text = re.search(r'(\w+ \d+, \d+)', title).groups()[0]
      except AttributeError:
        pass
    if not published_on_text:
      try:
        published_on_text = re.search(r'(\d+/\d+)', title).groups()[0]
      except AttributeError:
        pass
    if not published_on_text:
      admin.log_no_date("lsc", report_id, title, report_url)
      return

  if not published_on:
    datetime_formats = [
      '%B %d, %Y',
      '%m/%d/%Y',
      '%m/%d/%y',
      '%m/%Y',
      '%m/%y',
    ]
    for datetime_format in datetime_formats:
      try:
        published_on = datetime.datetime.strptime(published_on_text, datetime_format)
      except ValueError:
        pass
      else:
        break

  if not published_on:
    admin.log_no_date("lsc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'lsc',
    'inspector_url': 'https://www.oig.lsc.gov',
    'agency': 'lsc',
    'agency_name': 'Legal Services Corporation',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  if report_url == "https://www.oig.lsc.gov/core-legal-services":
    report['file_type'] = "html"
  if report_url.startswith("https://oig.lsc.gov/mapping/references/eval"):
    report['unreleased'] = True
    report['missing'] = True
  return report

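# The cascade of strptime formats above recurs across these scrapers. A
# hypothetical helper (not part of the codebase) capturing the pattern: it
# returns the first successful parse, or None if no format matches.
def _first_parsed_date(text, formats):
  for fmt in formats:
    try:
      return datetime.datetime.strptime(text, fmt)
    except ValueError:
      continue
  return None
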
def run(options):
  year_range = inspector.year_range(options, archive)
  report_seen_flag = False

  for url in years_to_index_urls(year_range):
    index = utils.beautifulsoup_from_url(url)
    tables = index.find_all("table")
    lis = index.select("ul.field li")
    if len(tables) >= 1:
      table = tables[0]
      trs = table.select('tr')
      for tr in trs:
        tds = tr.select('td')
        if not tds:
          continue
        if RE_YEAR.match(tds[0].text):
          continue
        if "".join(td.text for td in tds).strip() == "":
          continue
        report_seen_flag = True
        # Reset for each row, so a failed parse can't reuse the previous
        # row's date
        published_on_dt = None
        try:
          published_on_dt = parse_date(tds[0].text.strip())
        except Exception:
          pass
        if not published_on_dt:
          try:
            published_on_dt = parse_date(tds[2].text.strip())
          except Exception:
            pass
        if not published_on_dt:
          admin.log_no_date("epa", tds[2].text, tds[1].text)
          continue
        if published_on_dt.year not in year_range:
          continue
        report = report_from_table(tds, published_on_dt, url)
        if report:
          inspector.save_report(report)
    else:
      for li in lis:
        report_seen_flag = True
        # Resolve the link up front so it is always available for logging
        href = urljoin(url, li.a["href"])
        published_on_dt = None
        date_match = RE_DATE.search(li.text)
        if date_match:
          published_on_dt = parse_date(date_match.group())
        elif href in REPORT_PUBLISHED_MAPPING:
          published_on_dt = REPORT_PUBLISHED_MAPPING[href]
        if not published_on_dt:
          admin.log_no_date("epa", extract_url(li), li.a.text, href)
          continue
        if published_on_dt.year not in year_range:
          continue
        report = report_from_list(li, published_on_dt, url)
        if report:
          inspector.save_report(report)

  if not report_seen_flag:
    raise inspector.NoReportsFoundError("EPA")

def report_from(result, landing_url, report_type, year_range):
  report_url = urljoin(landing_url, result.get('href'))
  report_url = report_url.replace("../", "")
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  try:
    title = result.parent.find("em").text
  except AttributeError:
    try:
      title = result.parent.contents[0].text
    except AttributeError:
      title = result.parent.contents[0]

  # There's a typo in the link for this report; it points to the wrong file
  if (report_id == "Report14-28-TN-17163" and
      title.find("Report on the Better Basics, Inc., Literacy Program "
                 "for Clay, Jefferson") != -1):
    report_url = "http://www.arc.gov/images/aboutarc/members/IG/Report14-34-AL-17208-302-12.pdf"
    report_id = "Report14-34-AL-17208-302-12"

  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    try:
      published_on_text = title.split("\u2013")[-1].strip()
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except ValueError:
      pass

  if not published_on:
    try:
      response = utils.scraper.request(method="HEAD", url=report_url)
      last_modified = response.headers["Last-Modified"]
      published_on = datetime.datetime.strptime(last_modified,
                                                "%a, %d %b %Y %H:%M:%S %Z")
    except (KeyError, ValueError):
      # The header may be absent or in an unexpected format
      pass

  if not published_on:
    admin.log_no_date("arc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'arc',
    'inspector_url': 'http://www.arc.gov/oig',
    'agency': 'arc',
    'agency_name': 'Appalachian Regional Commission',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': report_type,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def process_report(result, year_range):
  """Use the report ID obtained from HTML to hit GAO's API"""
  # <a href="/assets/690/685452.pdf">View Report (PDF, 8 pages)</a>
  # 685452 is the ID used by the API.
  # The link's path looks like "/products/GAO-17-558", use the last part
  # as the report ID
  landing_url = urljoin('https://www.gao.gov', result.a['href'])
  report_number = os.path.basename(result.a['href'])
  title = re.sub(r"\s+", " ", result.span.text).strip()
  description = re.sub(r"\s+", " ", result.p.text).strip()

  dates = result.find_all('span')[-1].string.replace('\n', '').split(': ')
  # ['Published', 'Mar 31, 1959. Publicly Released', 'Mar 31, 1959.']
  # Prefer the first date; fall back to the last one if needed
  published_on = parse_date(dates[1].split('.')[0].strip())
  if not published_on:
    published_on = parse_date(dates[-1].replace('.', '').strip())
  if not published_on:
    admin.log_no_date("gaoreports", report_number, title, landing_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % landing_url)
    return

  pdf_links = result.find_all('li', {'class': 'pdf-link'})
  report_url, highlights_url, accessible_url = None, None, None
  for link in pdf_links:
    if not link.a or link.a['href'] == '':
      continue
    if 'View Report' in link.a.string:
      report_url = urljoin('https://www.gao.gov', link.a['href'])
    if 'Highlights' in link.a.string:
      highlights_url = urljoin('https://www.gao.gov', link.a['href'])
    if 'Accessible' in link.a.string:
      accessible_url = urljoin('https://www.gao.gov', link.a['href'])

  # The last PDF is the full report. The first one could be the Highlights.
  try:
    # get the ID from one of the filenames, minus the extension
    api_id = os.path.splitext(os.path.basename(pdf_links[-1].a['href']))[0]
  except Exception:
    # very old reports are sometimes different
    api_id = os.path.splitext(os.path.basename(result.a['href']))[0]
  api_id = api_id.lstrip('0')

  if not landing_url and not report_url:
    logging.debug("[%s] No landing URL or PDF, skipping..." % api_id)
    return None

  api_url = "http://www.gao.gov/api/id/%s" % api_id
  json_response = json.loads(utils.download(api_url))
  if not json_response:
    return None
  details = json_response[0]

  # A response looks like this:
  # {
  #   "youtube_id": null,
  #   "type": "reports",
  #   "content_id": "685451",
  #   "bucket_term": "Defense Management",
  #   "title": "DOD Has Taken Initial Steps to Formulate",
  #   "description": null,
  #   "rptno": "GAO-17-523R",
  #   "docdate": "2017-06-23",
  #   "actual_release_date": "2017-06-23T12:00:00Z",
  #   "actual_release_date_formatted": "Jun 23, 2017",
  #   "original_release_dt": null,
  #   "category_img": "http://www.gao.gov/images/rip/defense.jpg",
  #   "category_img_alt": "defense icon",
  #   "additional_links": "",
  #   "topics": ["National Defense"],
  #   "subsite": ["Correspondence"],
  #   "format": null,
  #   "mime_type_s": null,
  #   "ereport_flag": 0,
  #   "pdf_url": "http://www.gao.gov/assets/690/685452.pdf",
  #   "url": "http://www.gao.gov/products/GAO-17-523R",
  #   "document_type": "report",
  #   "supplement_url": null,
  #   "description_short": ""
  # }

  if 'html_url' in details:
    accessible_url = details['html_url']

  categories = details.get('topics', None)
  if not categories:
    # json could have null or []
    categories = []
  if details['bucket_term']:
    categories.append(details['bucket_term'])

  # defer to HTML instead of API for this stuff
  # published_on = details['docdate']
  # posted_at = details['actual_release_date'][:10]
  # title = details['title']
  # report_type = details['document_type']
  # if details.get('description', None):
  #   description = details['description']

  report = {
    'inspector': 'gaoreports',
    'inspector_url': 'https://www.gao.gov',
    # often GAO reports do focus on a program in a specific external agency,
    # but we're not attempting to discern it.
    # We'll just have GAO for the inspector and the agency.
    'agency': 'gao',
    'agency_name': 'Government Accountability Office',
    'report_id': report_number,
    'landing_url': landing_url,
    'url': report_url,
    'title': title,
    'type': details['document_type'],
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    'highlights_url': highlights_url,
    'accessible_url': accessible_url,
    'description': description,
    'categories': categories,
    'category_img': details['category_img'],
    'category_img_alt': details['category_img_alt'],
    'subsite': details['subsite'],
  }

  if not report_url:
    report['unreleased'] = True

  return report

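# A hedged, standalone illustration of the GAO lookup used above: derive the
# numeric content id from a PDF path and query the /api/id endpoint. The
# helper name is hypothetical; it assumes utils.download returns the raw JSON
# body as a string, as process_report does.
def _gao_api_details(pdf_href):
  api_id = os.path.splitext(os.path.basename(pdf_href))[0].lstrip('0')
  body = json.loads(utils.download("http://www.gao.gov/api/id/%s" % api_id))
  return body[0] if body else None
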
def report_from(result, page_url, report_type, year_range):
  try:
    title, date1, date2 = result.text.rsplit(",", 2)
    published_on_text = date1 + date2
    published_on = datetime.datetime.strptime(published_on_text.strip(), '%B %d %Y')
  except ValueError:
    try:
      title, date1, date2, date3 = result.text.rsplit(maxsplit=3)
      published_on_text = date1 + date2 + date3
      published_on = datetime.datetime.strptime(published_on_text.strip(), '%B%d,%Y')
    except ValueError:
      title = result.text
      published_on = None

  title = clean_text(title)
  original_title = title
  report_id, title = title.split(maxsplit=1)
  report_id = report_id.rstrip(":")

  if result.name == "a":
    link = result
  else:
    link = result.a
  report_url = urljoin(page_url, link['href'])
  # HTTPS, even if they haven't updated their links yet
  report_url = re.sub("^http://www.treasury.gov", "https://www.treasury.gov",
                      report_url)

  if report_id.find('-') == -1:
    # If the first word of the text doesn't contain a hyphen,
    # then it's probably part of the title, and not a tracking number.
    # In this case, fall back to the URL.
    report_filename = report_url.split("/")[-1]
    report_id, extension = os.path.splitext(report_filename)
    report_id = unquote(report_id)
    # Reset the title, since we previously stripped off the first word
    # as a candidate report_id.
    title = original_title

  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    admin.log_no_date("treasury", report_id, title, report_url)
    return

  # Skip this report, it already shows up under other audit reports
  if report_id == "Role of Non-Career Officials in Treasury FOIA Processing":
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'treasury',
    'inspector_url': 'https://www.treasury.gov/about/organizational-structure/ig/',
    'agency': 'treasury',
    'agency_name': "Department of the Treasury",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def audit_report_from(result, landing_url, year, year_range):
  if not result.text.strip():
    return

  link = result.find("a")
  report_url = urljoin(landing_url, link['href'])
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  try:
    title = result.select("blockquote")[0].contents[0]
  except IndexError:
    title = result.text

  title_prefixer = re.compile(
    r"(Advisory|Management|Audit)\s*(Letter|Report)\s*[\d\-]+:\s*",
    re.I)
  title = title_prefixer.sub("", title)

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  cleaned_text = re.sub(r"\s+", " ", inspector.sanitize(result.text))

  if not published_on:
    try:
      published_on_text = re.search(r'(\w+ \d+, \d+)', cleaned_text).groups()[0]
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except AttributeError:
      pass

  if not published_on:
    try:
      published_on_text = re.search(r'(\w+ \d+ , \d+)', cleaned_text).groups()[0]
      published_on = datetime.datetime.strptime(published_on_text, '%B %d , %Y')
    except AttributeError:
      pass

  if not published_on:
    try:
      response = utils.scraper.request(method="HEAD", url=report_url)
      last_modified = response.headers["Last-Modified"]
      published_on = datetime.datetime.strptime(last_modified,
                                                "%a, %d %b %Y %H:%M:%S %Z")
    except (KeyError, ValueError):
      # The header may be absent or in an unexpected format
      pass

  if not published_on:
    admin.log_no_date("archives", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'archives',
    'inspector_url': 'https://www.archives.gov/oig/',
    'agency': 'archives',
    'agency_name': 'National Archives and Records Administration',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': 'audit',
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  return report

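# The Last-Modified fallback above also appears in the arc scraper. A
# hypothetical shared helper (not part of the codebase), assuming
# utils.scraper exposes a requests-style interface with a .headers mapping:
def _published_on_from_last_modified(report_url):
  response = utils.scraper.request(method="HEAD", url=report_url)
  last_modified = response.headers.get("Last-Modified")
  if not last_modified:
    return None
  try:
    return datetime.datetime.strptime(last_modified,
                                      "%a, %d %b %Y %H:%M:%S %Z")
  except ValueError:
    return None
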
def audit_report_from(result, page_url, year_range):
  if not clean_text(result.text):
    # Empty row
    return

  # Get all direct child nodes
  children = list(result.find_all(True, recursive=False))

  published_on_text = clean_text(children[1].text)
  # this is the header row
  if published_on_text.strip() == "Date":
    return None

  date_formats = ['%m/%d/%Y', '%m/%d%Y', '%m/%d/%y']
  published_on = None
  for date_format in date_formats:
    try:
      published_on = datetime.datetime.strptime(published_on_text, date_format)
    except ValueError:
      pass

  report_summary = clean_text(children[2].text)
  if not report_summary:
    # There is an extra row that we want to skip
    return

  report_summary = report_summary.replace("OIG-15-38Administrative",
                                          "OIG-15-38 Administrative")
  summary_match = SUMMARY_RE.match(report_summary)
  summary_match_2 = SUMMARY_FALLBACK_RE.match(report_summary)
  if summary_match:
    report_id = summary_match.expand(r"\1-\2-\3")
    title = summary_match.group(4)
  elif summary_match_2:
    report_id = summary_match_2.expand(r"\2-\1-\3")
    title = summary_match_2.group(4)
  elif report_summary.startswith("IGATI") and published_on is not None:
    # There are two such annual reports from different years, append the year
    report_id = "IGATI %d" % published_on.year
    title = report_summary
  elif report_summary == "Report on the Bureau of the Fiscal Service Federal " \
      "Investments Branch\u2019s Description of its Investment/" \
      "Redemption Services and the Suitability of the Design and Operating " \
      "Effectiveness of its Controls for the Period August 1, 2013 to " \
      "July 31, 2014":
    # This one is missing its ID in the index
    report_id = "OIG-14-049"
    title = report_summary
  elif report_summary == "Correspondence related to the resolution of audit " \
      "recommendation 1 OIG-16-001 OFAC Libyan Sanctions Case Study (Please " \
      "read this correspondence in conjunction with the report.)":
    # Need to make up a report_id for this supplemental document
    report_id = "OIG-16-001-resolution"
    title = report_summary
  else:
    try:
      filename_match = FILENAME_RE.match(os.path.basename(result.a["href"]))
      report_id = filename_match.group(1)
      title = report_summary
    except (ValueError, IndexError, AttributeError):
      raise Exception("Couldn't parse report ID: %s" % repr(report_summary))

  if report_id == 'OIG-15-015' and \
      'Financial Statements for hte Fiscal Years 2014 and 2013' in title:
    # This report is listed twice, once with a typo
    return

  if report_id == 'OIG-07-003' and published_on_text == '11/23/2006':
    # This report is listed twice, once with the wrong date
    return

  # There are copy-paste errors with several retracted reports
  if report_id == 'OIG-14-037' and published_on is not None:
    if published_on.year == 2011 or published_on.year == 2010:
      return
  if report_id == 'OIG-13-021' and published_on_text == '12/12/2012':
    return

  if published_on is None:
    admin.log_no_date("treasury", report_id, title)
    return

  agency_slug_text = children[0].text
  if report_id in REPORT_AGENCY_MAP:
    agency_slug = REPORT_AGENCY_MAP[report_id]
  else:
    agency_slug = clean_text(agency_slug_text.split("&")[0]).lower()

  if (report_id in UNRELEASED_REPORTS or
      "If you would like a copy of this report" in report_summary or
      "If you would like to see a copy of this report" in report_summary or
      "have been removed from the OIG website" in report_summary or
      "removed the auditors\u2019 reports from the" in report_summary or
      "Classified Report" in report_summary or
      "Classified Audit Report" in report_summary or
      "Sensitive But Unclassified" in report_summary or
      "To obtain further information, please contact the OIG" in report_summary or
      "Report is under compliance review" in report_summary):
    unreleased = True
    report_url = None
    landing_url = page_url
  else:
    link = result.select("a")[0]
    report_url = urljoin(AUDIT_REPORTS_BASE_URL, link['href'])
    if report_url == AUDIT_REPORTS_BASE_URL:
      raise Exception("Invalid link found: %s" % link)
    unreleased = False
    landing_url = None

  # HTTPS, even if they haven't updated their links yet
  if report_url is not None:
    report_url = re.sub("^http://www.treasury.gov", "https://www.treasury.gov",
                        report_url)
  if report_url == "https://www.treasury.gov/about/organizational-structure/ig/Documents/OIG-11-071.pdf":
    report_url = "https://www.treasury.gov/about/organizational-structure/ig/Documents/OIG11071.pdf"

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'treasury',
    'inspector_url': 'https://www.treasury.gov/about/organizational-structure/ig/',
    'agency': agency_slug,
    'agency_name': AGENCY_NAMES[agency_slug],
    'type': 'audit',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if unreleased:
    report['unreleased'] = unreleased
  if landing_url:
    report['landing_url'] = landing_url
  return report

def report_from(result, landing_url, report_type, year_range):
  link = result.find("a")
  report_url = urljoin(landing_url, link.get('href').strip())
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)
  title = link.text

  file_type = None
  unreleased = False
  # Normalize the title for easier detection
  if "Non Public Report" in title.replace("-", " "):
    unreleased = True
    landing_url = report_url
    report_url = None
  elif not report_url.endswith(".pdf"):
    # A link to an html report
    file_type = "html"

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    if not os.path.splitext(report_filename)[1]:
      report_doc = utils.beautifulsoup_from_url(report_url)
      if report_doc:
        time_tag = report_doc.time
        if time_tag:
          date = report_doc.time["datetime"]
          published_on = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")

  if not published_on:
    if landing_url == SEMIANNUAL_REPORTS_URL:
      fy_match = re.match("Fiscal Year ([0-9]{4})", title)
      if fy_match:
        year = int(fy_match.group(1))
        if "(First Half)" in title:
          published_on = datetime.datetime(year, 3, 31)
          estimated_date = True
        elif "(Second Half)" in title:
          published_on = datetime.datetime(year, 9, 30)
          estimated_date = True

  if not published_on:
    admin.log_no_date("ftc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'ftc',
    'inspector_url': "https://www.ftc.gov/about-ftc/office-inspector-general",
    'agency': 'ftc',
    'agency_name': "Federal Trade Commission",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  if file_type:
    report['file_type'] = file_type
  return report

def rss_report_from(result, year_range):
  report_url = result.find("link").next_sibling.strip()
  if report_url.rstrip("/") == 'http://www.si.edu/oig':
    # This is the default url the IG uses for announcements of things like
    # a website redesign or changes to the RSS feed.
    return

  if report_url == "http://www.si.edu/oig/OIGStratPlan.pdf":
    # This strategic plan is no longer on the website, but it is reproduced in
    # multiple semiannual reports, so we skip it here.
    return

  if report_url in RSS_BROKEN_LINKS:
    report_url = RSS_BROKEN_LINKS[report_url]
  else:
    report_url = report_url.replace("/OIG/SAR/Semiannual_Reports/", "/OIG/SAR/")
    report_url = report_url.replace("/oig/Semiannual_Reports/", "/Content/OIG/SAR/")
    report_url = report_url.replace("/oig/AuditReports/", "/Content/OIG/Audits/")
    report_url = report_url.replace("/oig/ARRA_Reports/", "/Content/OIG/Audits/")

  file_type = None
  if not report_url.endswith(".pdf"):
    file_type = "html"

  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  if report_id in report_ids_seen:
    return
  report_ids_seen.add(report_id)

  title = result.find("title").text
  report_type = report_type_from_url(report_url)

  published_on = None
  published_on_text = result.find("pubdate").text
  try:
    published_on = datetime.datetime.strptime(published_on_text,
                                              '%a, %d %b %Y %H:%M:%S %z').date()
  except ValueError:
    pass
  if not published_on:
    try:
      published_on = datetime.datetime.strptime(published_on_text,
                                                '%a, %d %B %Y %H:%M:%S %z').date()
    except ValueError:
      pass

  if not published_on:
    admin.log_no_date("smithsonian", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'smithsonian',
    'inspector_url': 'https://www.si.edu/OIG',
    'agency': 'smithsonian',
    'agency_name': 'Smithsonian Institution',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if file_type:
    report['file_type'] = file_type
  return report

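# The two strptime formats above cover feeds that abbreviate ("Jun") or
# spell out ("June") the month name. A sketch of an alternative using the
# standard library's RFC 2822 parser, which accepts both spellings; the
# helper name is hypothetical, and it assumes the feed carries a usable
# timezone offset.
import email.utils

def _parse_pubdate(text):
  try:
    return email.utils.parsedate_to_datetime(text).date()
  except (TypeError, ValueError):
    return None
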
def report_from(result, landing_url, year_range):
  report_url = urljoin(landing_url, result.get('href'))
  # HTTPS, even if they haven't updated their links yet
  report_url = re.sub("^http://www.fca.gov", "https://www.fca.gov", report_url)

  if landing_url + '#' in report_url:
    # These are just anchor links, skip them.
    return

  if result.find_parent("ul") and result.find_parent("ul").get('type') == 'disc':
    # These are just anchor links, skip them.
    return

  title = clean_text(result.text)
  if title == 'Inspector General Reports':
    # Just a return link to the main IG page
    return

  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  if report_url == "https://www.fca.gov/Download/InspectorGeneral/Inspectionrpts/TravelCardProgram.pdf":
    report_id = "TravelCardProgram-2017"
  if report_url == "https://www.fca.gov/Download/InspectorGeneral/Inspectionrpts/PurchaseCardProgram.pdf":
    report_id = "PurchaseCardProgram-2017"

  published_on = None
  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]

  if not published_on:
    try:
      li = result.parent
      if li.name == "u":
        li = li.parent
      published_on_text = li.contents[1].lstrip(",").split("(")[0].strip()
    except (IndexError, TypeError):
      published_on_text = result.text.strip()
    published_on_text = clean_text(published_on_text)
    try:
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except ValueError:
      pass

  if not published_on:
    try:
      published_on_text = li.contents[1].strip().lstrip("(").rstrip(")")
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except (IndexError, TypeError, ValueError):
      pass

  if not published_on:
    try:
      published_on_text = "/".join(re.search(r"(\w{3}).* (\d{4})", published_on_text).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%b/%Y')
    except AttributeError:
      pass

  if not published_on:
    admin.log_no_date("fca", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report_type_text = result.find_previous("p", class_="mainContentheader2").text.strip()
  report_type = type_for_report(report_type_text)

  report = {
    'inspector': 'fca',
    'inspector_url': 'https://www.fca.gov/home/inspector.html',
    'agency': 'fca',
    'agency_name': 'Farm Credit Administration',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

def report_from(result, year_range, report_type=None):
  if result.name == 'a':
    link = result
  else:
    link = result.select("a")[-1]

  href = link['href']
  href = href.replace("file://///cftc.gov/home/dc/MWOODLAND/Desktop/", "")
  report_url = urljoin(REPORTS_URL, href)
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)
  title = link.text

  published_on = None
  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]

  if not published_on:
    try:
      published_on_text = "/".join(re.search(r"(\w+) (\d+), (\d+)", title).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%B/%d/%Y')
    except AttributeError:
      pass

  if not published_on:
    try:
      published_on_text = "/".join(
        re.search(r"(\w+) (\d+), (\d+)", str(link.next_sibling)).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%B/%d/%Y')
    except AttributeError:
      pass

  if not published_on:
    admin.log_no_date("cftc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  if not report_type:
    report_type = extract_report_type(title)
  if not report_type:
    report_type = extract_report_type(result.find_previous("p").text)
  if not report_type:
    report_type = "other"

  report = {
    'inspector': 'cftc',
    'inspector_url': 'http://www.cftc.gov/About/OfficeoftheInspectorGeneral/index.htm',
    'agency': 'cftc',
    'agency_name': 'Commodity Futures Trading Commission',
    'file_type': 'pdf',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': report_type,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report

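# extract_report_type is defined elsewhere in the cftc module. A minimal
# keyword-matching sketch of what such a function plausibly does; the mapping
# below is an assumption for illustration, not the module's real table.
def _extract_report_type_sketch(text):
  text = text.lower()
  for keyword, report_type in (("audit", "audit"),
                               ("inspection", "inspection"),
                               ("investigation", "investigation"),
                               ("semiannual", "semiannual_report")):
    if keyword in text:
      return report_type
  return None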