def run(options):
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # Pull the audit reports. Pages are 0-indexed.
  for page in range(0, int(pages)):
    doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(page=page)))
    results = doc.select("span.field-content")
    if not results:
      # No more results, we must have hit the last page
      break

    for result in results:
      report = report_from(result, year_range, report_type='audit')
      if report:
        inspector.save_report(report)

  # Grab the other reports
  for report_type, url in OTHER_REPORT_URLS.items():
    doc = BeautifulSoup(utils.download(url))
    results = doc.select(".views-field")
    if not results:
      results = doc.select(".views-row")
    for result in results:
      report = report_from(result, year_range, report_type)
      if report:
        inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2006:  # The oldest year for audit reports
      continue
    url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div#content li")
    for result in results:
      report = audit_report_from(result, url, year, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div#content li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the Peer Review
  doc = BeautifulSoup(utils.download(PEER_REVIEWS_URL))
  result = doc.find("div", id='content').find("a", text=True)
  report = peer_review_from(result, year_range)
  inspector.save_report(report)
def urls_for_topics(self, topics):
  for topic in topics:
    # Topic might be a tuple for ADDITIONAL_TOPICS (not ones from command
    # line).
    self.report_type = None
    if isinstance(topic, tuple):
      topic, report_type = topic
      self.report_type = report_type

    last_page = False

    url = TOPIC_TO_URL[topic]
    page = BeautifulSoup(utils.download(url))
    page_started = self.is_first_page(page)
    if page_started:
      yield url

    for link in page.select('li.pager-item a'):
      next_url = urljoin(url, link['href'])
      next_page = BeautifulSoup(utils.download(next_url))
      if not page_started:
        page_started = self.is_first_page(next_page)
      if page_started:
        yield next_url
      last_page = self.is_last_page(next_page)
      if last_page:
        break
    if last_page:
      continue

  self.report_type = None  # Clear this out afterwards
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    url = AUDITS_REPORTS_URL.format(str(year)[2:4])
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("tr")
    if not results:
      raise inspector.NoReportsFoundError("NASA (%d)" % year)
    for index, result in enumerate(results):
      if not index or not result.text.strip():
        # Skip the header row and any empty rows
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
  results = doc.select("#subContainer ul li")
  if not results:
    raise inspector.NoReportsFoundError("NASA (other)")
  for result in results:
    report = other_report_from(result, year_range)
    if report:
      inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the RSS feed
  doc = BeautifulSoup(utils.download(RSS_URL))
  results = doc.select("item")
  for result in results:
    report = rss_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the recent audit reports.
  doc = BeautifulSoup(utils.download(RECENT_AUDITS_URL))
  results = doc.select("div.block > a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the archive audit reports
  doc = BeautifulSoup(utils.download(AUDIT_ARCHIVE_URL))
  results = doc.select("div.block a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORTS_URl))
  results = doc.select("div.block > a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2005:  # This is the earliest audits go back
      continue
    url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div.content")
    if not results:
      raise inspector.NoReportsFoundError("Tennessee Valley Authority (%d)" % year)
    for result in results:
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("report")
  if not results:
    raise inspector.NoReportsFoundError("Tennessee Valley Authority (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS.items():
    for page in range(0, 999):
      url = report_url_format.format(page=page)
      doc = BeautifulSoup(utils.download(url))
      results = doc.select("li.views-row")
      if not results:
        if page == 0:
          raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
        else:
          break

      for result in results:
        report = report_from(result, url, report_type, year_range)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports (no pagination)
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("li.views-row")
  if not results:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the general reports
  doc = BeautifulSoup(utils.download(REPORTS_URL))
  results = doc.select("div#mainContent li.mainContenttext a")
  for result in results:
    report = report_from(result, REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the archive reports
  doc = BeautifulSoup(utils.download(REPORT_ARCHIVE_URL))
  results = doc.select("div#mainContent li.mainContenttext a") + doc.select("div#mainContent span.mainContenttext a")
  for result in results:
    if not result.text:
      continue
    report = report_from(result, REPORT_ARCHIVE_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div#mainContent li.mainContenttext a")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2002:  # The oldest page for audit reports
      continue
    doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(year=year)))
    results = doc.select("div.content table tr")
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, report_type="audit", year_range=year_range)
      if report:
        inspector.save_report(report)

  # Pull the FOIA reports
  doc = BeautifulSoup(utils.download(FOIA_REPORTS_URL))
  results = doc.select("div.content table tr")
  for index, result in enumerate(results):
    if not index:
      # Skip the header row
      continue
    report = report_from(result, report_type="other", year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div.content a")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options)

  # Pull the audit reports
  for year in year_range:
    url = audit_report_url(year)
    if url:
      parse_result_from_js_url(url, "auditreports", year, year_range)
    url = inspection_report_url(year)
    if url:
      parse_result_from_js_url(url, "iereports", year, year_range)

  # Pull the congressional testimony
  doc = BeautifulSoup(utils.download(CONGRESSIONAL_TESTIMONY_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = congressional_testimony_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
def urls_for(self):
  only = self.options.get('topics')
  if only:  # if only...
    only = set(only.split(','))
    only = [(o, TOPIC_TO_REPORT_TYPE[o]) if o in TOPIC_TO_REPORT_TYPE else o
            for o in only]
    yield from self.urls_for_topics(only)
    # If there are topics selected, ONLY yield URLs for those.
    return

  # First yield the URLs for the topics that are tangential to the main
  # Calendar Year reports.
  yield from self.urls_for_topics(ADDITIONAL_TOPICS)

  # Not getting reports from specific topics, iterate over all Calendar Year
  # reports.
  page = BeautifulSoup(utils.download(BASE_URL))

  # Iterate over each "Calendar Year XXXX" link
  for li in page.select('.field-items li'):
    md = RE_CALENDAR_YEAR.search(li.text)
    if md:
      cur_year = int(md.group(1))
      if cur_year >= self.year_range[0] and cur_year <= self.year_range[-1]:
        href = li.select('a')[0]['href']
        next_url = urljoin(BASE_URL, href)

        # The first page of reports is yielded.
        yield next_url

        # Next, read all the pagination links for the page and yield those. So
        # far, I haven't seen a page that doesn't have all of the following
        # pages enumerated.
        next_page = BeautifulSoup(utils.download(next_url))
        for link in next_page.select('li.pager-item a'):
          yield urljoin(BASE_URL, link['href'])
def run(options):
  year_range = inspector.year_range(options, archive)

  doc = BeautifulSoup(utils.download(REPORTS_URL))

  # Pull the semiannual reports
  semiannual_results = doc.select("#AnnualManagementReports select")[0]
  for result in semiannual_results.select("option"):
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the special reports
  special_report_table = doc.find("table", attrs={"bordercolor": "#808080"})
  for index, result in enumerate(special_report_table.select("tr")):
    if not index:
      # Skip the header row
      continue
    report = report_from(result, REPORTS_URL, report_type='other', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the audit reports
  for year in year_range:
    if year < 2001:  # The oldest fiscal year page available
      continue
    year_url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(year_url))
    for index, result in enumerate(doc.select("#main table tr")):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, year_url, report_type='audit', year_range=year_range)
      if report:
        inspector.save_report(report)
def extract_reports_for_oei(year_range): topic_name = TOPIC_NAMES["OE"] topic_url = TOPIC_TO_URL["OE"] root_body = utils.download(topic_url) root_doc = BeautifulSoup(root_body) letter_urls = set() for link in root_doc.select("#leftContentInterior li a"): absolute_url = urljoin(topic_url, link['href']) absolute_url = strip_url_fragment(absolute_url) letter_urls.add(absolute_url) if not letter_urls: raise inspector.NoReportsFoundError("HHS (OEI first pass)") all_results_links = {} all_results_unreleased = [] for letter_url in letter_urls: letter_body = utils.download(letter_url) letter_doc = BeautifulSoup(letter_body) results = letter_doc.select("#leftContentInterior ul li") if not results: raise inspector.NoReportsFoundError("HHS (OEI %s)" % letter_url) for result in results: if 'crossref' in result.parent.parent.attrs.get('class', []): continue if result.parent.parent.attrs.get('id') == 'related': continue node = result while node and node.name != "h2": node = node.previous if node and node.name == "h2": subtopic_name = str(node.text) else: subtopic_name = "(unknown)" links = result.findAll("a") if len(links) == 0: result.extract() all_results_unreleased.append([result, subtopic_name]) else: url = links[0].get("href") if url not in all_results_links: result.extract() all_results_links[url] = [result, subtopic_name] else: existing_result = all_results_links[url][0] for temp in result.contents: temp.extract() existing_result.append(temp) all_results_links[url][1] = "%s, %s" % (all_results_links[url][1], subtopic_name) subtopic_url = TOPIC_TO_URL["OE"] for result, subtopic_name in itertools.chain(all_results_links.values(), all_results_unreleased): report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name) if report: inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL)) results = doc.select("td.text table tr") if not results: raise inspector.NoReportsFoundError("National Science Foundation (audit reports") for result in results: # ignore divider lines if result.select("img"): continue report = report_from(result, report_type='audit', year_range=year_range) if report: inspector.save_report(report) # Pull the semiannual reports doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL)) results = doc.select("td.text table tr") if not results: raise inspector.NoReportsFoundError("National Science Foundation (semiannual reports)") for result in results: if not result.text.strip(): continue report = semiannual_report_from(result, year_range) if report: inspector.save_report(report) # Pull the case reports response = utils.scraper.post( url=CASE_REPORTS_URL, data=CASE_REPORTS_DATA, ) doc = BeautifulSoup(response.content) results = doc.select("td.text table tr") if not results: raise inspector.NoReportsFoundError("National Science Foundation (case reports)") for index, result in enumerate(results): if not index or not result.text.strip(): # Skip the header row and empty rows continue report = case_report_from(result, CASE_REPORTS_URL, year_range) if report: inspector.save_report(report) # Pull the testimony doc = BeautifulSoup(utils.download(TESTIMONY_REPORTS_URL)) results = doc.select("td.text table tr") if not results: raise inspector.NoReportsFoundError("National Science Foundation (testimony)") for result in results: if not result.text.strip(): continue report = report_from(result, report_type='testimony', year_range=year_range) if report: inspector.save_report(report)
def handle_scanner_args(args, opts) -> Tuple[dict, list]:
  """
  --analytics: file path or URL to a CSV of participating domains.

  This function also handles checking for the existence of the file,
  downloading it successfully, and reading the file in order to populate the
  list of analytics domains.
  """
  parser = scan_utils.ArgumentParser(prefix_chars="--")
  parser.add_argument("--analytics", nargs=1, required=True)
  parsed, unknown = parser.parse_known_args(args)
  dicted = vars(parsed)
  should_be_single = ["analytics"]
  dicted = scan_utils.make_values_single(dicted, should_be_single)
  resource = dicted.get("analytics")

  if not resource.endswith(".csv"):
    no_csv = "".join([
      "--analytics should be the file path or URL to a CSV of participating",
      " domains and end with .csv, which '%s' does not" % resource
    ])
    logging.error(no_csv)
    raise argparse.ArgumentTypeError(no_csv)

  try:
    parsed_url = urlparse(resource)
  except:
    raise

  if parsed_url.scheme and parsed_url.scheme in ("http", "https"):
    analytics_path = Path(opts["_"]["cache_dir"], "analytics.csv").resolve()
    try:
      utils.download(resource, str(analytics_path))
    except:
      logging.error(utils.format_last_exception())
      no_csv = "--analytics URL %s not downloaded successfully." % resource
      logging.error(no_csv)
      raise argparse.ArgumentTypeError(no_csv)
  else:
    if not os.path.exists(resource):
      no_csv = "--analytics file %s not found." % resource
      logging.error(no_csv)
      raise FileNotFoundError(no_csv)
    else:
      analytics_path = resource

  analytics_domains = utils.load_domains(analytics_path)
  dicted["analytics_domains"] = analytics_domains
  del dicted["analytics"]

  return (dicted, unknown)
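# Usage sketch (hypothetical values, not part of the scanner itself):
# handle_scanner_args() expects the raw scanner CLI arguments plus an options
# dict that already carries a cache directory under opts["_"]["cache_dir"].
# Assuming a reachable CSV URL:
#
#   opts = {"_": {"cache_dir": "./cache"}}
#   scan_opts, unknown = handle_scanner_args(
#       ["--analytics", "https://example.com/analytics-domains.csv"], opts)
#   scan_opts["analytics_domains"]  # list of domains loaded from the cached CSV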
def run(options):
  year_range = inspector.year_range(options)
  only_id = options.get('report_id')

  print("## Downloading reports from %i to %i" % (year_range[0], year_range[-1]))

  url = url_for()
  body = utils.download(url)
  doc = BeautifulSoup(body)

  results = doc.select("section")

  for result in results:
    try:
      year = int(result.get("title"))
      # check that the fetched year is in the range
      if year not in year_range:
        continue
      print("## Downloading year %i " % year)
    except ValueError:
      continue

    # gets each table entry and generates a report from it
    listings = result.div.table.tbody.contents
    for item in listings:
      if type(item) is not bs4.element.Tag:
        continue
      report = report_from(item)

      # can limit it to just one report, for debugging convenience
      if only_id and only_id != report['report_id']:
        continue

      inspector.save_report(report)
def fetch_from_landing_page(self, landing_url):
  """Returns a tuple of (pdf_link, summary_text, is_unreleased)."""
  unreleased = False
  page = BeautifulSoup(utils.download(landing_url))

  summary = None
  field_items = page.select('.field-items')
  if field_items:
    text = [node.strip() for node in field_items[0].findAll(text=True)]
    summary = '\n\n'.join(text).strip()
  if not summary:
    logging.info('\tno summary text found')

  if (summary and (RE_NOT_AVAILABLE.search(summary)
                   or RE_NOT_AVAILABLE_2.search(summary)
                   or RE_NOT_AVAILABLE_3.search(summary)
                   or RE_NOT_AVAILABLE_4.search(summary)
                   or RE_CLASSIFIED.search(summary))):
    unreleased = True

  report_url = None
  pdf_link = page.select('.file a')
  if not pdf_link:
    logging.warn('No pdf link found on page: {0}'.format(landing_url))
  else:
    report_url = pdf_link[0]['href']

  return report_url, summary, unreleased
def run(options):
  year_range = inspector.year_range(options, archive)
  report_flag = False

  # Pull the table of reports for each year
  for year in year_range:
    url = url_for_year(year)
    html = utils.download(url, scraper_slug="osc")

    if html is None:
      if year == max(year_range):
        continue
      else:
        raise Exception("Couldn't fetch reports page {}".format(url))

    # spaces appear as &nbsp; and \u200b .... fix that now
    html = html.replace('&nbsp;', ' ').replace('\u200b', ' ').replace('\u00a0', ' ').replace('\r', '').replace('\n', '')
    doc = BeautifulSoup(html, "lxml")

    OUTCOME_CODES = generate_outcome_codes(doc)

    keys_used = []  # a few reports appear multiple times... ignore them the second time if they appear more than once

    results = doc.findAll("table")[1].tbody.findAll('tr')  # no ids on the tables, but it's the second one
    for result in results:
      reports = report_from(result, year, year_range, url, OUTCOME_CODES)
      for report in reports:
        if report['report_id'] not in keys_used:
          inspector.save_report(report)
          keys_used.append(report['report_id'])
          report_flag = True

  if not report_flag:
    raise inspector.NoReportsFoundError("OSC")
def run(options):
  year_range = inspector.year_range(options)

  topics = options.get('topics')
  if topics:
    topics = topics.split(",")
  else:
    topics = TOPIC_TO_URL.keys()

  for topic in topics:
    topic_url = TOPIC_TO_URL[topic]
    body = utils.download(topic_url)
    doc = BeautifulSoup(body)

    try:
      year_results = doc.select("#Listing")[0]
      results = [x for x in year_results.select("ul li ul li")]
    except IndexError:
      try:
        all_results = doc.select("#bodyholder")[0]
        results = [x for x in all_results.select("ul li")]
      except IndexError:
        results = doc.select("table ul li")

    # Sometimes multiple reports are listed under the same datetime element.
    # We store which published datetime we saw last so that the next report
    # can use it if we are unable to find another published time.
    last_published_on = None
    for result in results:
      report, last_published_on = report_from(result, topic_url, topic, year_range, last_published_on)
      if report:
        inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options)

  for page_url in URLS:
    done = False
    body = utils.download(page_url)
    doc = BeautifulSoup(body)

    maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
    all_p = maincontent.find_all("p")

    for p in all_p:
      for all_text, link_text, link_url in recurse_tree(p, False):
        if link_url == None:
          continue
        if link_url.startswith("mailto:"):
          continue
        if page_url == WHATS_NEW_URL and link_url == "/oig/whats-new-archive.cfm":
          # end of page
          done = True
          break
        if link_url.startswith("https://public.govdelivery.com/"):
          continue
        for index_url in URLS:
          if index_url.find(link_url) != -1:
            continue
        year = DATE_RE.search(all_text).group(3)
        if int(year) not in year_range:
          continue

        report = report_from(all_text, link_text, link_url, page_url)
        inspector.save_report(report)
      if done:
        break
def run(options): year_range = inspector.year_range(options, archive) component = options.get('component') if component: components = [component] else: components = list(COMPONENTS.keys()) report_id = options.get('report_id') limit = int(options.get('limit', 0)) all_audit_reports = {} for component in components: logging.info("## Fetching reports for component %s" % component) url = url_for(options, component) body = utils.download(url) doc = BeautifulSoup(body) results = doc.select("table.contentpaneopen table[border=1] tr") # accept only trs that look like body tr's (no 'align' attribute) # note: HTML is very inconsistent. cannot rely on thead or tbody results = [x for x in results if x.get('align') is None] if not results: raise inspector.NoReportsFoundError("DHS (%s)" % component) count = 0 for result in results: report = report_from(result, component, url) if not report: continue if report_id and (report_id != report['report_id']): continue if inspector.year_from(report) not in year_range: # logging.info("[%s] Skipping, not in requested range." % report['report_id']) continue key = (report["report_id"], report["title"]) if key in all_audit_reports: all_audit_reports[key]["agency"] = all_audit_reports[key]["agency"] + \ ", " + report["agency"] all_audit_reports[key]["agency_name"] = \ all_audit_reports[key]["agency_name"] + ", " + \ report["agency_name"] else: all_audit_reports[key] = report count += 1 if limit and (count >= limit): break logging.info("## Fetched %i reports for component %s\n\n" % (count, component)) for report in all_audit_reports.values(): inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options)

  doc = BeautifulSoup(utils.download(REPORTS_URL))

  # Pull the audit reports
  audit_header = doc.find("a", attrs={"name": 'Audit Reports'})
  audit_list1 = audit_header.find_next("ul").select("li")
  # They have two separate uls for these reports. See note to the IG web team.
  audit_list2 = audit_header.find_next("ul").find_next("ul").select("li")
  results = audit_list1 + audit_list2
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the inspection reports
  inspections_header = doc.find("a", attrs={"name": 'Inspection Reports'})
  results = inspections_header.find_next("ul").select("li")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  semiannual_header = doc.find("a", attrs={"name": 'Semiannual Reports'})
  results = semiannual_header.find_next("ul").select("li")
  for result in results:
    report = report_from(result, year_range, title_prefix="Semiannual Report - ")
    if report:
      inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)

  # Find the number of pages to iterate
  doc = BeautifulSoup(utils.download(REPORTS_URL))
  page_count_text = doc.select("div.AspNet-GridView-Pagination")[0].text
  page_count = int(re.search(r"Page 1 of (\d+)", page_count_text).groups()[0])

  # Iterate over those pages
  for page in range(1, page_count + 1):
    response = utils.scraper.post(
      REPORTS_URL,
      data={
        "__EVENTTARGET": "ctl00$ctl00$MainContent$NavTreeSubContent$sv$GridViewSummary",
        "__EVENTARGUMENT": "Page${page_number}".format(page_number=page),
      },
      cookies=COOKIES,
    )
    doc = BeautifulSoup(response.content)
    results = doc.select("div.AspNet-GridView table tr")
    if not results:
      break
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, year_range)
      if report:
        inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options)
  pages = options.get('pages', ALL_PAGES)

  max_page = None
  for page in range(1, (int(pages) + 1)):
    if max_page and (page > max_page):
      print("End of pages!")
      break

    print("## Downloading page %i" % page)
    url = url_for(options, page)
    body = utils.download(url)
    doc = BeautifulSoup(body)

    max_page = last_page_for(doc)

    results = doc.select(".views-row")
    for result in results:
      report = report_from(result)

      # inefficient enforcement of --year arg, USPS doesn't support it server-side
      # TODO: change to published_on.year once it's a datetime
      if inspector.year_from(report) not in year_range:
        print("[%s] Skipping report, not in requested range." % report['report_id'])
        continue

      inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options)
  max_pages = int(options.get('pages', 1))

  for year in year_range:
    page = 1
    done = False
    while not done:
      url = url_for(options, page, year)
      body = utils.download(url)
      doc = BeautifulSoup(body)

      next_page = page + 1
      found_next_page = False
      page_links = doc.select("li.pager-item a.active")
      for page_link in page_links:
        if page_link.text == str(next_page):
          found_next_page = True
          break
      if not found_next_page:
        done = True
      if next_page > max_pages:
        done = True

      results = doc.select("table.views-table > tbody > tr")
      for result in results:
        report = report_from(result)
        inspector.save_report(report)

      page = next_page
      if not done:
        print('Moving to next page (%d)' % page)
def run(options): year_range = inspector.year_range(options, archive) topics = options.get('topics') if topics: topics = topics.split(",") else: topics = TOPIC_TO_URL.keys() all_reports = {} for topic in topics: year_urls = urls_for(year_range, topic) for year_url in year_urls: logging.debug("Scraping %s" % year_url) body = utils.download(year_url) doc = BeautifulSoup(body) if not doc.select(".view-business-areas"): raise inspector.NoReportsFoundError("DOT (%s)" % topic) results = doc.select(".view-business-areas .views-row") for result in results: report = report_from(result, year_range, topic, options) if report: report_id = report["report_id"] if report_id in all_reports: all_reports[report_id]["topic"] = all_reports[report_id]["topic"] \ + ", " + topic else: all_reports[report_id] = report for report in all_reports.values(): inspector.save_report(report)
def fetch_from_landing_page(landing_url):
  """Returns a tuple of (pdf_link, summary_text)."""
  add_pdf = False

  body = utils.download(landing_url)
  page = BeautifulSoup(body)

  link = page.find('a', text=RE_PDF_LINK_TEXT, href=RE_PDF_HREF)
  if not link:
    link = page.find('a', text=RE_PDF_CLICK_TEXT, href=RE_PDF_HREF)
  if not link:
    link = page.find('a', text=RE_PDF_SARC_TEXT, href=RE_PDF_HREF)
  # cases where .pdf is left off, ugh, e.g.
  # http://www.dodig.mil/pubs/report_summary.cfm?id=849
  if not link:
    link = page.find('a', text=RE_PDF_LINK_TEXT, href=RE_BACKUP_PDF_HREF)
    add_pdf = True

  href = link['href'].strip() if link else None
  if href and add_pdf:
    href = href + ".pdf"

  summary = None
  text_tr = page.select('tr[valign="top"] td')
  if text_tr:
    text = [node.strip() for node in text_tr[0].findAll(text=True)]
    summary = '\n\n'.join(text)
  if not summary:
    logging.info('\tno summary text found')

  return (href, summary)
def extract_reports_for_subtopic(subtopic_url, year_range, topic, subtopic=None):
  if subtopic_url.startswith("http://httphttp://"):
    # See notes to IG's web team
    subtopic_url = subtopic_url.replace("http://http", "")

  body = utils.download(subtopic_url)
  doc = BeautifulSoup(body)
  results = doc.select("#body-row02-col02andcol03 a")
  if not results:
    results = doc.select("#body-row02-col01andcol02andcol03 a")
  if not results and "There are currently no reports in this category" not in doc.text:
    raise AssertionError("No report links found for %s" % subtopic_url)

  topic_name = TOPIC_NAMES[topic]
  # Broadcasting Board of Governors is a fully independent agency
  if topic == 'BBG' or subtopic == 'Broadcasting Board of Governors':
    agency = 'bbg'
  else:
    agency = 'state'

  for result in results:
    report = report_from(result, year_range, agency, topic_name, subtopic)
    if report:
      inspector.save_report(report)
def semiannual_report_from(result, year_range): link = result.find("a") title = link.text # Parse the report title. Ex: # 'OIG Semiannual Report to the Congress: October 1, 2013 - March 31, 2014 (incl. MCC)' published_on_text = title.split("-")[-1].split("–")[-1].split("(")[0].strip() published_on_text = published_on_text.replace("September 31", "September 30") # See note to IG Web team published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y') if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % title) return landing_url = urljoin(SEMIANNUAL_REPORTS_URL, link.get('href')) landing_page = BeautifulSoup(utils.download(landing_url)) report_url = landing_page.select("div.filefield-file a")[0].get('href') report_filename = report_url.split("/")[-1] report_id, _ = os.path.splitext(report_filename) report = { 'inspector': "usaid", 'inspector_url': "https://oig.usaid.gov", 'agency': "usaid", 'agency_name': "Agency For International Development", 'type': 'semiannual_report', 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } return report
def run(options): year_range = inspector.year_range(options, archive) pages = options.get('pages', ALL_PAGES) # default to starting at page 1 begin = int(options.get('begin', 1)) max_page = None for page in range(begin, (int(pages) + 1)): if max_page and (page > max_page): logging.debug("End of pages!") break logging.debug("## Downloading page %i" % page) url = url_for(options, page) body = utils.download(url) doc = BeautifulSoup(body) # When the USPS restores their page controls, we can use this again, # which saves one network call each time. max_page = last_page_for(doc) results = doc.select(".views-row") for result in results: report = report_from(result) # inefficient enforcement of --year arg, USPS doesn't support it server-side # TODO: change to published_on.year once it's a datetime if inspector.year_from(report) not in year_range: logging.warn("[%s] Skipping report, not in requested range." % report['report_id']) continue inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  for report_type, report_url in REPORT_URLS.items():
    doc = BeautifulSoup(utils.download(report_url))
    results = doc.select("td.mainInner div.ms-WPBody li")
    if not results:
      raise inspector.NoReportsFoundError("SIGTARP (%s)" % report_url)
    for result in results:
      report = report_from(result, report_type, year_range)
      if report:
        inspector.save_report(report)
def beautifulsoup_from_url(url):
  body = utils.download(url)
  if body is None:
    return None

  doc = BeautifulSoup(body)

  # Some of the pages will return meta refreshes
  if doc.find("meta") and doc.find("meta").attrs.get('http-equiv') == 'REFRESH':
    redirect_url = urljoin(url, doc.find("meta").attrs['content'].split("url=")[1])
    return beautifulsoup_from_url(redirect_url)
  else:
    return doc
def run(options): year_range = inspector.year_range(options, archive) # Pull the reports doc = BeautifulSoup(utils.download(REPORTS_URL)) results = doc.find("table", {"cellpadding": "5"}).select("tr") if not results: raise inspector.NoReportsFoundError("FDIC") for index, result in enumerate(results): if index < 3 or not result.text.strip(): # The first three rows are headers continue report = report_from(result, year_range) if report: inspector.save_report(report)
def parse_result_from_js_url(url, format_slug, year, year_range, report_type):
  """
  Given a link to a javascript file that has report data, add all of the reports
  """
  body = utils.download(url, scraper_slug="tigta")

  # Pulling out javascript array values that look like:
  # arrid[0]=new AR("200720002","Stronger Management Oversight Is Required to Ensure Valuable Systems Modernization Expertise Is Received From the Federally Funded Research and Development Center Contractor","20061020","01",2,0,0,0);
  # Look in https://www.treasury.gov/tigta/oa_auditreports_fy14.js for some more examples.
  results = re.findall(r'arrid\[\d+\]=new AR\((.*)\);', body)
  for result in results:
    report = report_from(result, format_slug, year, year_range, report_type)
    if report:
      inspector.save_report(report)
def get_pagination_urls(page):
  """Find the pagination links on the page and yield them all.

  This method recursively downloads new pages in the case that there are more
  than 10.
  """
  for link in page.select('a'):
    if 'href' not in link.attrs:
      continue
    if link['href'].startswith('?') and RE_DIGITS.match(link.text):
      yield BASE_URL + link['href']
    elif link['href'].startswith('/pubs') and RE_NEXT_10.search(link.text):
      new_url = urljoin(BASE_URL, link['href'])
      page = BeautifulSoup(utils.download(new_url))
      for link in get_pagination_urls(page):
        yield link
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  for report_type, url in REPORT_TYPE_MAP.items():
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div#content div#contentMain ul li.pdf")
    if not results:
      raise inspector.NoReportsFoundError("CPB (%s)" % url)
    for result in results:
      if not result.find('a'):
        # Skip unlinked PDF's
        continue
      report = report_from(result, url, report_type, year_range)
      if report:
        inspector.save_report(report)
def get_subtopic_map(topic_url):
  body = utils.download(topic_url)
  doc = BeautifulSoup(body)

  subtopic_map = {}
  for link in doc.select("#leftContentInterior li a"):
    absolute_url = urljoin(topic_url, link['href'])
    absolute_url = strip_url_fragment(absolute_url)

    # Only add new URLs
    if absolute_url not in subtopic_map.values():
      subtopic_map[link.text] = absolute_url

  if not subtopic_map:
    raise inspector.NoReportsFoundError("OEI (subtopics)")

  return subtopic_map
def reports_from_page(url_format, page, report_type, year_range, year=''):
  url = url_format.format(page=page, year=year)
  doc = BeautifulSoup(utils.download(url))
  results = doc.select("td.views-field")
  if not results:
    results = doc.select("div.views-row")
  if not results:
    return False

  for result in results:
    if not result.text.strip():
      # Skip empty rows
      continue
    report = report_from(result, report_type, year_range)
    if report:
      inspector.save_report(report)
  return True
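# Caller sketch (hypothetical; the real run() driver is not shown here).
# reports_from_page() returns False once a page has no report rows, so a
# paginated driver can walk pages until that happens. The starting page
# number (0 here) is an assumption:
#
#   def scrape_paginated(url_format, report_type, year_range, year=''):
#     page = 0
#     while reports_from_page(url_format, page, report_type, year_range, year):
#       page += 1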
def run(options):
  year_range = inspector.year_range(options, archive)

  doc = BeautifulSoup(utils.download(REPORTS_URL))

  results = None
  for section in doc.find_all("section"):
    if section.h4 and section.h4.text.strip() == "Publications":
      results = section.find_all("a")
      break
  if not results:
    raise inspector.NoReportsFoundError("Denali Commission")

  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)
def semiannual_report_from(result, year_range): # This will look like "toggleReport('SARC-47-49');" and we want to pull out # the SARC-47-49 report_id_javascript = result.get('onclick') report_id = re.search("'(.*)'", report_id_javascript).groups()[0] landing_url = "http://oig.pbgc.gov/sarc/{report_id}.html".format( report_id=report_id) landing_page = BeautifulSoup(utils.download(landing_url)) title = " ".join(landing_page.select("h3")[0].text.split()) relative_report_url = landing_page.find( "a", text="Read Full Report").get('href') # The relative report urls try to go up a level too many. Most browsers seem # to just ignore this so we will too. relative_report_url = relative_report_url.replace("../", "", 1) report_url = urljoin(SEMIANNUAL_REPORTS_URL, relative_report_url) # There is probably a way to be a bit smarter about this summary = landing_page.text.strip() published_on_text = title.rsplit("-")[-1].rsplit("through")[-1].replace( ".", "").strip() published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y') if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % title) return report = { 'inspector': "pbgc", 'inspector_url': "http://oig.pbgc.gov", 'agency': "pbgc", 'agency_name': "Pension Benefit Guaranty Corporation", 'type': 'semiannual_report', 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } if summary: report['summary'] = summary if landing_url: report['landing_url'] = landing_url return report
def extract_from_release_page(landing_url):
  doc = BeautifulSoup(utils.download(landing_url))
  main = doc.select("#main #lefSide")[0]

  url_elem = main.select("div")[2].select("a")
  if url_elem:
    url = urljoin(landing_url, url_elem[0]['href'])
  else:
    url = None

  summary = ""
  for p in main.select("p"):
    summary += p.text + "\n\n"

  # will only be used if the title isn't present on the listing
  title = main.select("h2")[0].text.strip()

  return (url, summary.strip(), title)
def run(options):
  year_range = inspector.year_range(options, archive)

  for index in INDEX_URLS:
    report_count = 0
    for year in year_range:
      url = url_for(options, index, year)
      body = utils.download(url)
      doc = BeautifulSoup(body)

      results = doc.select("div.view-content div.views-row")
      for result in results:
        report = report_from(result)
        inspector.save_report(report)
        report_count = report_count + 1

    if report_count == 0:
      raise inspector.NoReportsFoundError("Amtrak (%s)" % index.split("/")[-1])
def run(options): year_range = inspector.year_range(options, archive) last_page = options.get("end") start = int(options.get("start", 1)) # Pull the reports for (reports_page, report_type) in REPORTS_URLS: page = start last_page = options.get("end") # reset for each area while True: url = url_for(reports_page, page) doc = BeautifulSoup(utils.download(url)) if last_page is None: last_page = last_page_from(doc) if report_type == "case": results = doc.select("div#main div.grayBox2") else: results = doc.select("div#main div.whiteBox") if results: for result in results: report = report_from(result, reports_page, report_type, year_range) if report: inspector.save_report(report) elif report_type != "case": raise inspector.NoReportsFoundError("CNCS (%s)" % url) # closed cases have broken pagination (p6, 7, 8 missing) so ignore else: pass if int(page) >= int(last_page): break else: page += 1 # one hardcoded peer review, just always do it inspector.save_report(do_peer_review())
def run(options):
  year_range = inspector.year_range(options, archive)
  only_id = options.get('report_id')

  logging.info("## Downloading reports from %i to %i" % (year_range[0], year_range[-1]))

  url = url_for()
  body = utils.download(url)
  doc = BeautifulSoup(body)

  results = doc.select("section")
  if not results:
    raise inspector.NoReportsFoundError("OPM")

  for result in results:
    try:
      year = int(result.get("title"))
      # check that the fetched year is in the range
      if year not in year_range:
        continue
      logging.info("## Downloading year %i " % year)
    except ValueError:
      continue

    # gets each table entry and generates a report from it
    listings = result.div.table.tbody.contents
    for item in listings:
      if type(item) is not bs4.element.Tag:
        continue
      report = report_from(item)

      if report['report_id'] in BLACKLIST:
        logging.warn(
          "Skipping downed report: remember to report this and get it fixed!"
        )
        continue

      # can limit it to just one report, for debugging convenience
      if only_id and only_id != report['report_id']:
        continue

      inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)

  only = options.get('topics')
  if only:
    only = set(only.split(','))
  else:
    only = ALL_TOPIC_AREAS

  index_body = utils.download(BASE_URL)
  current_year = None

  index = BeautifulSoup(index_body)
  tables = index.select('table.style1')
  if not tables:
    raise inspector.NoReportsFoundError("EPA")
  for table in tables:
    trs = table.select('tr')
    for tr in trs:
      tds = tr.select('td')
      if len(tds) < 8:
        if len(tds) == 1:
          # Large column that indicates year
          col_links = tds[0].select('a')
          if len(col_links) == 1:
            col_text = col_links[0].text
            if RE_YEAR.match(col_text):
              current_year = col_text
        continue

      published_on_dt = datetime.datetime.strptime(tds[6].text, '%m/%d/%Y')
      if published_on_dt.year not in year_range:
        continue

      topic_areas = set(tds[7].text.split(', '))
      if not len(topic_areas.intersection(only)):
        continue

      report = report_from(tds, published_on_dt, current_year)
      if report:
        inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) topics = options.get('topics') if topics: topics = topics.split(",") else: topics = TOPIC_TO_URL.keys() for topic in topics: topic_url = TOPIC_TO_URL[topic] body = utils.download(topic_url) doc = BeautifulSoup(body) try: year_results = doc.select("#Listing")[0] results = [x for x in year_results.select("ul li ul li")] except IndexError: try: all_results = doc.select("#bodyholder")[0] results = [x for x in all_results.select("ul li")] except IndexError: results = doc.select("table ul li") if not results: raise inspector.NoReportsFoundError("SEC (%s)" % topic) # Sometimes multiple reports are listed under the same datetime element. # We store which published datetime we saw last so that the next report # can use if if we are unable to find another published time. last_published_on = None for result in results: report, last_published_on = report_from(result, topic_url, topic, year_range, last_published_on) if report: inspector.save_report(report) for canned_report in CANNED_REPORTS: report_datetime = datetime.datetime.strptime( canned_report["published_on"], "%Y-%m-%d") if report_datetime.year in year_range: add_common_fields(canned_report) inspector.save_report(canned_report)
def urls_for(options, only):
  year_range = inspector.year_range(options, archive)
  for office in only:
    # there's always a first year, and it defaults to current year
    params = {}
    params['searchdate1'] = '01/01/%s' % year_range[0]
    params['searchdate2'] = '12/31/%s' % year_range[-1]  # could be the same year
    params['office'] = OFFICES[office]
    params['sort'] = 'report_number'
    params['order'] = 'desc'

    query_string = urlencode(params)
    url = '{0}?{1}'.format(BASE_URL, query_string)
    yield url

    body = utils.download(url)
    page = BeautifulSoup(body)
    for url in get_pagination_urls(page):
      yield url
def run(options):
  only = options.get('topics')
  if only:
    only = set(only.split(','))
  else:
    # Default to all offices, whee!
    only = list(OFFICES.keys())

  for url in urls_for(options, only):
    body = utils.download(url)
    page = BeautifulSoup(body)

    report_table = page.select('table[summary~="reports"]')[0]
    for tr in report_table.select('tr')[1:]:
      tds = tr.select('td')
      if len(tds) == 1:
        # Page has no reports, simply a "No Data" indication for these dates.
        break
      report = report_from(tds, options)
      if report:
        inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) doc = BeautifulSoup(utils.download(REPORTS_URL)) # Pull the audit reports audit_header = doc.find("a", attrs={"name": 'Audit Reports'}) audit_list1 = audit_header.find_next("ul").select("li") # They have two separate uls for these reports. See note to the IG web team. audit_list2 = audit_header.find_next("ul").find_next("ul").select("li") results = audit_list1 + audit_list2 if not results: raise inspector.NoReportsFoundError("FEC (audit reports)") for result in results: report = report_from(result, year_range, report_type='audit') if report: inspector.save_report(report) # Pull the inspection reports inspections_header = doc.find("a", attrs={"name": 'Inspection Reports'}) results = inspections_header.find_next("ul").select("li") if not results: raise inspector.NoReportsFoundError("FEC (inspection reports)") for result in results: report = report_from(result, year_range, report_type='inspection') if report: inspector.save_report(report) # Pull the semiannual reports semiannual_header = doc.find("a", attrs={"name": 'Semiannual Reports'}) results = semiannual_header.find_next("ul").select("li") if not results: raise inspector.NoReportsFoundError("FEC (semiannual reports)") for result in results: report = report_from(result, year_range, report_type='semiannual_report', title_prefix="Semiannual Report - ") if report: inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the reports for report_type, url in REPORT_URLS.items(): doc = BeautifulSoup(utils.download(url)) results = doc.select("div.section1 div.ltext > table tr") if not results: results = doc.select( "td.three-col-layout-middle div.ltext > table tr") if not results: raise inspector.NoReportsFoundError( "Government Publishing Office (%s)" % url) for result in results: if (not result.text.strip() or result.find("th") or result.find("strong") or result.contents[1].text in HEADER_TITLES): # Skip header rows continue report = report_from(result, url, report_type, year_range) if report: inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)
  max_pages = options.get('pages', None)
  if max_pages:
    max_pages = int(max_pages)

  for year in year_range:
    page = 1
    done = False
    while not done:
      url = url_for(options, page, year)
      body = utils.download(url)
      doc = BeautifulSoup(body)

      next_page = page + 1
      found_next_page = False
      page_links = doc.select("li.pager-item a.active")
      for page_link in page_links:
        if page_link.text == str(next_page):
          found_next_page = True
          break
      if not found_next_page:
        done = True
      if max_pages and (next_page > max_pages):
        done = True

      results = doc.select("table.views-table > tbody > tr")
      if not results:
        raise inspector.NoReportsFoundError("Amtrak")
      for result in results:
        report = report_from(result)
        inspector.save_report(report)

      page = next_page
      if not done:
        logging.info('Moving to next page (%d)' % page)
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL)) headers = doc.select("p.Ptitle1") if not headers: raise inspector.NoReportsFoundError("ITC") for header in headers: year = int(header.text.strip()) results = header.findNextSibling("ul").select("li") for result in results: if not inspector.sanitize(result.text): logging.debug("Skipping empty list item.") continue report = audit_report_from(year, result, AUDIT_REPORTS_URL, year_range) if report: inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports doc = BeautifulSoup(utils.download(REPORTS_URL)) results = doc.select("ul.text > ul > li") if not results: raise inspector.NoReportsFoundError("CFTC audit reports") for result in results: report = report_from(result, year_range) if report: inspector.save_report(report) # Pull the semiannual reports results = doc.select("ul.text td a") if not results: raise inspector.NoReportsFoundError("CFTC semiannual reports") for result in results: report = report_from(result, year_range, report_type="semiannual_report") if report: inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)
  only_report_id = options.get('report_id')

  # Pull the reports
  for report_type, url in REPORT_URLS.items():
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div.field-item li")
    if not results:
      results = doc.select("div.field-item tr")
    if not results:
      raise inspector.NoReportsFoundError("National Endowment for the Arts (%s)" % report_type)
    for result in results:
      report = report_from(result, url, report_type, year_range)
      if report:
        # debugging convenience: can limit to single report
        if only_report_id and (report['report_id'] != only_report_id):
          continue
        inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) report_flag = False # Pull the table of reports for each year for year in year_range: url = url_for_year(year) html = utils.download(url, scraper_slug="osc") if html is None: if year == max(year_range): continue else: raise Exception("Couldn't fetch reports page {}".format(url)) # spaces appear as   and \u200b .... fix that now html = html.replace(' ', ' ').replace('\u200b', ' ').replace( '\u00a0', ' ').replace('\r', '').replace('\n', '') doc = BeautifulSoup(html, "lxml") OUTCOME_CODES = generate_outcome_codes(doc) keys_used = [ ] # a few reports appear multiple times... ignore them the second time if they appear more than once results = doc.findAll("table")[1].tbody.findAll( 'tr') # no ids on the tables, but it's the second one for result in results: reports = report_from(result, year, year_range, url, OUTCOME_CODES) for report in reports: if report['report_id'] not in keys_used: inspector.save_report(report) keys_used.append(report['report_id']) report_flag = True if not report_flag: raise inspector.NoReportsFoundError("OSC")
def run(options): year_range = inspector.year_range(options, archive) pages = options.get('pages', ALL_PAGES) # default to starting at page 1 begin = int(options.get('begin', 1)) max_page = None for page in range(begin, (int(pages) + 1)): if max_page and (page > max_page): logging.debug("End of pages!") break logging.debug("## Downloading page %i" % page) url = url_for(options, page) body = utils.download(url) doc = BeautifulSoup(body) # When the USPS restores their page controls, we can use this again, # which saves one network call each time. max_page = last_page_for(doc) results = doc.select(".views-row") if not results: raise inspector.NoReportsFoundError("USPS") for result in results: report = report_from(result) # inefficient enforcement of --year arg, USPS doesn't support it server-side # TODO: change to published_on.year once it's a datetime if inspector.year_from(report) not in year_range: logging.warn("[%s] Skipping report, not in requested range." % report['report_id']) continue inspector.save_report(report)
def report_from(result, report_type, year_range): landing_page_link = result.find("a") title = landing_page_link.text.strip() landing_url = urljoin(BASE_REPORT_URL, landing_page_link.get('href')) # Sometimes the last report on one page is also the first report on the next # page. Here, we skip any duplicate landing pages we've already saved. if landing_url in visited_landing_urls: return # This landing page is a duplicate of another one if landing_url == "http://oig.ssa.gov/physical-security-office-disability-" \ "adjudication-and-reviews-headquarters-building-limited-0": return published_on_text = result.select( "span.date-display-single")[0].text.strip() published_on = datetime.datetime.strptime(published_on_text, '%A, %B %d, %Y') if published_on.year not in year_range: logging.debug("[%s] Skipping, not in requested range." % title) return try: report_id = result.select("span.field-data")[0].text.strip() except IndexError: report_id = landing_url.split("/")[-1] # This report has the wrong report number entered if landing_url == "http://oig.ssa.gov/audits-and-investigations/" \ "audit-reports/congressional-response-report-internet-claim-" \ "applications-0": report_id = "A-07-10-20166" landing_page = BeautifulSoup(utils.download(landing_url)) unreleased = False if "Limited Distribution" in title: unreleased = True report_url = None else: try: report_url = result.select("span.file a")[0].get('href') except IndexError: if not unreleased: try: report_url = landing_page.find( "a", attrs={ "type": 'application/octet-stream;' }).get('href') except AttributeError: unreleased = True report_url = None try: summary = landing_page.select( "div.field-type-text-with-summary")[0].text.strip() except IndexError: summary = None file_type = None if report_url: _, extension = os.path.splitext(report_url) if not extension: file_type = 'html' visited_landing_urls.add(landing_url) report = { 'inspector': "ssa", 'inspector_url': "http://oig.ssa.gov", 'agency': "ssa", 'agency_name': "Social Security Administration", 'type': report_type, 'landing_url': landing_url, 'report_id': report_id, 'url': report_url, 'title': title, 'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"), } if unreleased: report['unreleased'] = unreleased if file_type: report['file_type'] = file_type if summary: report['summary'] = summary return report
def beautifulsoup_from_url(url):
  body = utils.download(url)
  return BeautifulSoup(body)
def run(options): year_range = inspector.year_range(options, archive) published_on = None for page_url in [ WHATS_NEW_URL, WHATS_NEW_ARCHIVE_URL, SEMIANNUAL_REPORTS_AND_TESTIMONIES_URL ]: body = utils.download(page_url) doc = BeautifulSoup(body) maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0] all_a = maincontent.find_all("a") if not all_a: raise inspector.NoReportsFoundException("Ex-Im Bank (%s)" % page_url) for a in all_a: a_text = str(a.text) if a_text.strip() == "": continue a_href = a.get("href") if a_href.startswith("mailto:"): continue if a_href.startswith("https://public.govdelivery.com/"): continue if page_url == WHATS_NEW_URL and a_href == "/oig/whats-new-archive.cfm": # end of page break if deduplicate_url(a_href): continue # Now, we want to grab all of the text associated with this link. # If there is just one link inside of a paragraph tag, we can take the # text contents of that paragraph tag. Otherwise, we use "previous" to # grab all the text that comes before the link. parent_p = a while parent_p.name != "p": parent_p = parent_p.parent links_in_parent = parent_p.find_all("a") links_in_parent = [link for link in links_in_parent \ if len(link.text.strip())] links_in_parent = set( [link.get("href") for link in links_in_parent]) if len(links_in_parent) == 1: all_text = parent_p.text else: all_text = a_text node = a.previous while True: if is_inside_link(node): break if isinstance(node, NavigableString): all_text = node + all_text node = node.previous if not node: break if node == maincontent: break # Response letters don't get their own date heading -- keep date from # last report and reuse in those cases temp = DATE_RE.search(all_text) if temp: # For semiannual reports to congress, use the second date from the text # Also, tack the date onto the report_id to disambiguate if page_url == SEMIANNUAL_REPORTS_AND_TESTIMONIES_URL and a_text.strip( ).startswith('Semiannual Report to Congress'): a_text = a_text.strip() + ' ' + temp.group(0) + ' - ' temp = DATE_RE.search(all_text, temp.end() + 1) a_text = a_text + temp.group(0) date_text = temp.group(0).replace('Sept ', 'Sep ') try: published_on = datetime.strptime(date_text, '%B %d, %Y') except ValueError: published_on = datetime.strptime(date_text, '%b %d, %Y') if (published_on is None) or (published_on.year not in year_range): continue report = report_from(all_text, a_text, a_href, page_url, published_on) inspector.save_report(report) for page_url in [PRESS_RELEASES_URL, PRESS_RELEASES_ARCHIVE_URL]: done = False body = utils.download(page_url) doc = BeautifulSoup(body) maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0] all_p = maincontent.find_all("p") for p in all_p: for all_text, link_text, link_url in recurse_tree(p, False): if link_url == None: continue if link_url.startswith("mailto:"): continue if page_url == WHATS_NEW_URL and link_url == "/oig/whats-new-archive.cfm": # end of page done = True break if link_url.startswith("https://public.govdelivery.com/"): continue for index_url in URLS: if index_url.find(link_url) != -1: continue if deduplicate_url(link_url): continue date_match = DATE_RE.search(all_text) try: date_text = date_match.group(0).replace('Sept ', 'Sep ') published_on = datetime.strptime(date_text, '%B %d, %Y') except ValueError: published_on = datetime.strptime(date_text, '%b %d, %Y') if published_on.year not in year_range: continue report = report_from(all_text, link_text, link_url, page_url, published_on) inspector.save_report(report) if done: break
def fetch_from_landing_page(landing_url): """Returns a tuple of (pdf_link, summary_text).""" add_pdf = False skip = False body = utils.download(landing_url) page = BeautifulSoup(body) report_tables = page.select('table[summary~="reports"]') # in the rare case that doesn't work, have faith if len(report_tables) == 0: report_tables = page.select('table') table = report_tables[0] examine_text = table.text maybe_unreleased = False if RE_OFFICIAL.search(examine_text) or RE_CLASSIFIED.search(examine_text) or RE_FOIA.search(examine_text) or RE_AFGHANISTAN.search(examine_text) or RE_RESTRICTED.search(examine_text) or RE_INTEL.search(examine_text): # 'Official use only' or 'Classified' materials don't have PDFs. Mark the # report metadata appropriately. maybe_unreleased = True # two varieties of normal report link link = page.find('a', text=RE_PDF_LINK_TEXT, href=RE_PDF_HREF) if not link: link = page.find('a', text=RE_PDF_CLICK_TEXT, href=RE_PDF_HREF) # Semi annual reports to Congress if not link: link = page.find('a', text=RE_PDF_SARC_TEXT, href=RE_PDF_HREF) # occurs for some multi-part reports, top/body/bottom if not link: link = page.find('a', text=RE_PDF_BODY_MAYBE, href=RE_PDF_HREF) # cases where .pdf is left off, ugh, e.g. # http://www.dodig.mil/pubs/report_summary.cfm?id=849 if not link: link = page.find('a', text=RE_PDF_LINK_TEXT, href=RE_BACKUP_PDF_HREF) if link: add_pdf = True # last resort, slow python-based check for tightest requirements if not link: link = page.find(pdf_test) # before accepting *any* PDF, check for skippable offenses if not link and (RE_EXTERNALLY_HOSTED.search(table.text) or RE_RESCINDED.search(table.text) or RE_RETRACTED.search(table.text) or RE_UNUSED.search(table.text)): skip = True # okay, I'll take *any* PDF if not link: link = table.find(any_pdf_test) href = link['href'].strip() if link else None if href and add_pdf: href = href + ".pdf" # some URLs have "/../" in the middle, and the redirects are trouble if href: href = href.replace("/../", "/") summary = None text_tr = page.select('tr[valign="top"] td') if text_tr: text = [node.strip() for node in text_tr[0].findAll(text=True)] summary = '\n\n'.join(text) if not summary: logging.info('\tno summary text found') return (href, summary, maybe_unreleased, skip)
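# Consumer sketch (hypothetical; the real caller is not shown here).
# fetch_from_landing_page() returns (href, summary, maybe_unreleased, skip);
# a caller would typically drop skipped reports and only mark the rest as
# unreleased when no PDF link was found. Assumed handling only:
#
#   href, summary, maybe_unreleased, skip = fetch_from_landing_page(landing_url)
#   if not skip:
#     unreleased = maybe_unreleased and (href is None)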