Beispiel #1
0
def run(options):
    """Scrape SSA audit reports (paginated, per year) plus every other
    configured report type."""
    year_range = inspector.year_range(options, archive)

    # Audit reports: walk pages for each year until a page comes back empty.
    for year in year_range:
        for page_number in range(ALL_PAGES):
            found = reports_from_page(AUDIT_REPORTS_URL, page_number,
                                      'audit', year_range, year)
            if found:
                continue
            # An empty first page means the scraper is broken; an empty
            # later page just means we ran out of results.
            if page_number == 0:
                raise inspector.NoReportsFoundError(
                    "Social Security Administration (%d)" % year)
            break

    # Remaining report types: same pagination pattern, no year filter.
    for report_type, report_format in OTHER_REPORT_URLS.items():
        for page_number in range(ALL_PAGES):
            found = reports_from_page(report_format, page_number, report_type,
                                      year_range)
            if found:
                continue
            if page_number == 0:
                raise inspector.NoReportsFoundError(
                    "Social Security Administration (%s)" % report_type)
            break
Beispiel #2
0
def run(options):
    """Scrape TVA audit reports by year, then the semiannual reports page."""
    year_range = inspector.year_range(options, archive)

    # Audit reports (2005 is the earliest published year).
    for year in year_range:
        if year < 2005:  # This is the earliest audits go back
            continue
        url = AUDIT_REPORTS_URL.format(year=year)
        page = BeautifulSoup(utils.download(url))
        containers = page.select("div.content")
        if not containers:
            raise inspector.NoReportsFoundError(
                "Tennessee Valley Authority (%d)" % year)
        for container in containers:
            report = audit_report_from(container, url, year_range)
            if report:
                inspector.save_report(report)

    # Semiannual reports all live on a single page.
    page = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    entries = page.select("report")
    if not entries:
        raise inspector.NoReportsFoundError(
            "Tennessee Valley Authority (semiannual reports)")
    for entry in entries:
        report = semiannual_report_from(entry, year_range)
        if report:
            inspector.save_report(report)
def run(options):
  """Scrape FCA reports: general listing, archived reports, and
  semiannual reports."""
  year_range = inspector.year_range(options, archive)

  # General reports.
  page = utils.beautifulsoup_from_url(REPORTS_URL)
  links = page.select("div#mainContent li.mainContenttext a")
  if not links:
    raise inspector.NoReportsFoundError("Farm Credit Administration (reports)")
  for link in links:
    report = report_from(link, REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Archived reports: links appear under list items or spans.
  page = utils.beautifulsoup_from_url(REPORT_ARCHIVE_URL)
  links = (page.select("div#mainContent li.mainContenttext a") +
           page.select("div#mainContent span.mainContenttext a"))
  if not links:
    raise inspector.NoReportsFoundError("Farm Credit Administration (archive)")
  for link in links:
    # Skip anchors with no text (e.g. image-only links).
    if not link.text:
      continue
    report = report_from(link, REPORT_ARCHIVE_URL, year_range)
    if report:
      inspector.save_report(report)

  # Semiannual reports to Congress.
  page = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  links = page.select("div#mainContent li.mainContenttext a")
  if not links:
    raise inspector.NoReportsFoundError("Farm Credit Administration (semiannual reports)")
  for link in links:
    report = semiannual_report_from(link, year_range)
    if report:
      inspector.save_report(report)
Beispiel #4
0
def run(options):
  """Scrape USAID reports: paginated listings per type plus the
  single-page semiannual reports."""
  year_range = inspector.year_range(options, archive)

  # Paginated report listings, one URL template per report type.
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS.items():
    page = 0
    while page < 999:
      url = report_url_format.format(page=page)
      doc = BeautifulSoup(utils.download(url))
      rows = doc.select("li.views-row")
      if not rows:
        # Nothing on the very first page means the scraper is broken;
        # a later empty page means we've consumed all results.
        if page == 0:
          raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
        break

      for row in rows:
        report = report_from(row, url, report_type, year_range)
        if report:
          inspector.save_report(report)
      page += 1

  # Semiannual reports fit on a single page.
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  rows = doc.select("li.views-row")
  if not rows:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for row in rows:
    report = semiannual_report_from(row, year_range)
    if report:
      inspector.save_report(report)
Beispiel #5
0
def run(options):
  """Scrape NCUA reports: yearly audit pages, other reports, semiannual
  reports, and performance/strategic plans.

  Raises inspector.NoReportsFoundError if a section yields no results.
  """
  year_range = inspector.year_range(options, archive)
  # Set once any year's audit page yields table rows; checked only after
  # every year has been tried, because individual year pages may be
  # legitimately missing.
  results_flag = False

  # Pull the audit reports
  for year in year_range:
    if year < 2002:  # The oldest page for audit reports
      continue
    if year == 2018:
      # The newest audits live on a dedicated "latest" page rather than
      # a per-year URL.
      doc = utils.beautifulsoup_from_url(LATEST_AUDIT_REPORTS_URL)
    else:
      doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL.format(year=year))

    if doc is None:
      # Next year's audit page may not be published yet
      continue

    results = doc.select("div.mainCenter table tr")
    if results:
      results_flag = True
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, report_type='audit', year_range=year_range)
      if report:
        inspector.save_report(report)

  # No year page produced any table rows at all: treat as scraper failure.
  if not results_flag:
    raise inspector.NoReportsFoundError("NCUA (audit reports)")

  # Pull the other reports
  doc = utils.beautifulsoup_from_url(OTHER_REPORTS_URL)
  results = doc.select("div.mainCenter p")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (other)")
  for result in results:
    report = other_report_from(result, year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("div#mainColumns div.mainCenter a")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the performance and strategic plans
  doc = utils.beautifulsoup_from_url(PLANS_URL)
  results = doc.select("div.mainCenter p")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (performance/strategic plans)")
  for result in results:
    report = plan_from(result, year_range)
    if report:
      inspector.save_report(report)
Beispiel #6
0
def run(options):
  """Scrape VA audit reports (paginated) and semiannual reports."""
  year_range = inspector.year_range(options, archive)

  # Audit reports: pages are numbered from 1; stop at the first empty page.
  page = 1
  while page < 1000:
    doc = beautifulsoup_from_url("{}?RS={}".format(REPORTS_URL, page))
    entries = doc.select("div.leadin")
    if not entries:
      if page == 1:
        # An empty first page means the site layout changed.
        raise inspector.NoReportsFoundError("VA (audit reports)")
      break
    for entry in entries:
      report = report_from(entry, year_range)
      if report:
        inspector.save_report(report)
    page += 1

  # Semiannual reports: single page, same markup.
  doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  entries = doc.select("div.leadin")
  if not entries:
    raise inspector.NoReportsFoundError("VA (semiannual reports)")
  for entry in entries:
    report = semiannual_report_from(entry, year_range)
    if report:
      inspector.save_report(report)
Beispiel #7
0
def run(options):
    """Scrape NARA audit reports per year, semiannual reports, and the
    single peer-review report."""
    year_range = inspector.year_range(options, archive)

    # Audit reports (2006 is the earliest published year).
    for year in year_range:
        if year < 2006:
            continue
        url = AUDIT_REPORTS_URL.format(year=year)
        page = BeautifulSoup(utils.download(url))
        items = page.select("div#content li")
        if not items:
            raise inspector.NoReportsFoundError(
                "National Archives and Records Administration audit reports")
        for item in items:
            report = audit_report_from(item, url, year, year_range)
            if report:
                inspector.save_report(report)

    # Semiannual reports.
    page = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    items = page.select("div#content li")
    if not items:
        raise inspector.NoReportsFoundError(
            "National Archives and Records Administration semiannual reports")
    for item in items:
        report = semiannual_report_from(item, year_range)
        if report:
            inspector.save_report(report)

    # Peer review: a single text link inside the content div.
    page = BeautifulSoup(utils.download(PEER_REVIEWS_URL))
    link = page.find("div", id='content').find("a", text=True)
    report = peer_review_from(link, year_range)
    if report:
        inspector.save_report(report)
Beispiel #8
0
def run(options):
    """Scrape NLRB audit/inspection report pages and semiannual reports."""
    year_range = inspector.year_range(options, archive)

    # Audit and inspection listings: one page per report type.
    for report_type, reports_url in REPORT_URLS:
        page = BeautifulSoup(utils.download(reports_url))
        entries = page.select("div.field-item")
        if not entries:
            raise inspector.NoReportsFoundError(
                "National Labor Relations Board (%s)" % report_type)
        for entry in entries:
            report = report_from(entry, report_type, reports_url, year_range)
            if report:
                inspector.save_report(report)

    # Semiannual reports to Congress.
    page = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    entries = page.select("div.field-item")
    if not entries:
        raise inspector.NoReportsFoundError(
            "National Labor Relations Board (semiannual reports)")
    for entry in entries:
        report = semiannual_report_from(entry, year_range)
        if report:
            inspector.save_report(report)
Beispiel #9
0
def run(options):
  """Scrape NASA audit reports per year plus the other-reports page."""
  year_range = inspector.year_range(options, archive)

  # Audit reports: the URL template takes the two-digit year.
  for year in year_range:
    url = AUDITS_REPORTS_URL.format(str(year)[2:4])
    page = BeautifulSoup(utils.download(url))
    rows = page.select("tr")
    if not rows:
      raise inspector.NoReportsFoundError("NASA (%d)" % year)
    for index, row in enumerate(rows):
      # Skip the header row and any empty rows
      if index == 0 or not row.text.strip():
        continue
      report = audit_report_from(row, url, year_range)
      if report:
        inspector.save_report(report)

  # Other reports, all on one page.
  page = BeautifulSoup(utils.download(OTHER_REPORT_URL))
  items = page.select("#subContainer ul li")
  if not items:
    raise inspector.NoReportsFoundError("NASA (other)")
  for item in items:
    report = other_report_from(item, year_range)
    if report:
      inspector.save_report(report)
Beispiel #10
0
def run(options):
    """Scrape USAID paginated report listings and the semiannual page,
    with a sanity check that the audit report-number CSS class still
    exists on the first audit page."""
    year_range = inspector.year_range(options, archive)

    # Paginated listings, one URL template per report type.
    for report_type, report_url_format in PAGINATED_REPORT_FORMATS:
        for page in range(999):
            url = report_url_format.format(page=page)
            doc = utils.beautifulsoup_from_url(url)
            # Guard against silent breakage if the site changes markup.
            if (report_type == "audit" and page == 0 and
                    not doc.select("div.views-field-field-auditreport-doc-1")):
                raise Exception("Report number CSS class has changed")
            rows = doc.select("li.views-row")
            if not rows:
                # Empty first page = broken scraper; later = out of results.
                if page == 0:
                    raise inspector.NoReportsFoundError("USAID (%s)" %
                                                        report_type)
                break

            for row in rows:
                report = report_from(row, url, report_type, year_range)
                if report:
                    inspector.save_report(report)

    # Semiannual reports: a single page, no pagination.
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    rows = doc.select("li.views-row")
    if not rows:
        raise inspector.NoReportsFoundError("USAID (semiannual reports)")
    for row in rows:
        report = semiannual_report_from(row, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Scrape FCC audit, semiannual, and other reports."""
    year_range = inspector.year_range(options, archive)

    # Audit reports: table rows inside the page's <article>.
    page = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)
    rows = page.article.find_all("tr")
    if not rows:
        raise inspector.NoReportsFoundError("FCC (audit reports)")
    for row in rows:
        report = report_from(row, AUDIT_REPORTS_URL, year_range)
        if report:
            inspector.save_report(report)

    # Semiannual reports: same table layout.
    page = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    rows = page.article.find_all("tr")
    if not rows:
        raise inspector.NoReportsFoundError("FCC (semiannual reports)")
    for row in rows:
        report = semiannual_report_from(row, SEMIANNUAL_REPORTS_URL,
                                        year_range)
        if report:
            inspector.save_report(report)

    # Other reports: paragraphs rather than table rows.
    page = utils.beautifulsoup_from_url(OTHER_REPORTS_URL)
    paragraphs = page.article.find_all("p")
    if not paragraphs:
        raise inspector.NoReportsFoundError("FCC (other)")
    for paragraph in paragraphs:
        report = other_report_from(paragraph, OTHER_REPORTS_URL, year_range)
        if report:
            inspector.save_report(report)
Beispiel #12
0
def run(options):
    """Scrape Federal Reserve audit reports and semiannual reports,
    including the latest semiannual report embedded on the main page."""
    year_range = inspector.year_range(options, archive)

    # Audit reports.
    page = utils.beautifulsoup_from_url(REPORTS_URL)
    rows = page.select("#rounded-corner > tr")
    if not rows:
        raise inspector.NoReportsFoundError("Federal Reserve (audit reports)")
    for row in rows:
        report = report_from(row, year_range)
        if report:
            inspector.save_report(report)

    # Semiannual reports: follow each sidebar link to its own page.
    page = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    links = page.select("div.style-aside ul > li > a")
    if not links:
        raise inspector.NoReportsFoundError(
            "Federal Reserve (semiannual reports)")
    for link in links:
        report_url = urljoin(BASE_PAGE_URL, link.get('href'))
        report = semiannual_report_from(report_url, year_range)
        if report:
            inspector.save_report(report)

    # The most recent semiannual report will be embedded on the main page
    report = semiannual_report_from(SEMIANNUAL_REPORTS_URL, year_range)
    if report:
        inspector.save_report(report)
def run(options):
    """Scrape Federal Maritime Commission audit reports (current and
    historical year pages) and semiannual reports.

    Raises inspector.NoReportsFoundError if a page yields no results.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)
    results = doc.select("table tr")
    if not results:
        raise inspector.NoReportsFoundError(
            "Federal Maritime Commission (audits)")
    for result in results:
        if result.th:
            # Skip the header row
            continue
        report = report_from(result,
                             AUDIT_REPORTS_URL,
                             report_type='audit',
                             year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull historical audits (per-year links found on the main audit page;
    # `doc` still refers to that main page here).
    audit_year_links = doc.select("div.col-2-3 ul li a")
    for year_link in audit_year_links:
        audit_year_url = urljoin(AUDIT_REPORTS_URL, year_link.get('href'))
        doc = utils.beautifulsoup_from_url(audit_year_url)
        results = doc.select("table tr")
        if not results:
            # Grab results other than first and last (header and extra links)
            results = doc.select("div.col-2-2 ul")[1:-1]
        if not results:
            raise inspector.NoReportsFoundError(
                "Federal Maritime Commission (%s)" % audit_year_url)
        for result in results:
            if result.th:
                # Skip the header row
                continue
            # NOTE(review): the landing URL passed here is the top-level
            # AUDIT_REPORTS_URL, not audit_year_url — confirm report_from
            # resolves relative links correctly for year pages.
            report = report_from(result,
                                 AUDIT_REPORTS_URL,
                                 report_type='audit',
                                 year_range=year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select("div.col-2-2 p a") + doc.select("div.col-2-2 li a")
    if not results:
        raise inspector.NoReportsFoundError(
            "Federal Maritime Commission (semiannual reports)")
    for result in results:
        # The link's parent element is passed so report_from sees the
        # surrounding text, not just the anchor.
        report = report_from(result.parent,
                             AUDIT_REPORTS_URL,
                             report_type='semiannual_report',
                             year_range=year_range)
        if report:
            inspector.save_report(report)
Beispiel #14
0
def run(options):
  """Scrape NSF audit reports, semiannual reports, case reports (fetched
  via a POST form), and testimony.

  Raises inspector.NoReportsFoundError when a section's selector finds
  nothing, which usually means the site markup changed.
  """
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    # Fixed: the message was missing its closing parenthesis.
    raise inspector.NoReportsFoundError("National Science Foundation (audit reports)")
  for result in results:
    # ignore divider lines
    if result.select("img"):
      continue

    report = report_from(result, report_type='audit', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (semiannual reports)")
  for result in results:
    if not result.text.strip():
      continue
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the case reports; the listing sits behind a POST form.
  response = utils.scraper.post(
    url=CASE_REPORTS_URL,
    data=CASE_REPORTS_DATA,
  )
  doc = BeautifulSoup(response.content)
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (case reports)")
  for index, result in enumerate(results):
    if not index or not result.text.strip():  # Skip the header row and empty rows
      continue
    report = case_report_from(result, CASE_REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the testimony
  doc = BeautifulSoup(utils.download(TESTIMONY_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (testimony)")
  for result in results:
    if not result.text.strip():
      continue
    report = report_from(result, report_type='testimony', year_range=year_range)
    if report:
      inspector.save_report(report)
def run(options):
    """Scrape PBGC audits, congressional requests, semiannual reports,
    and congressional testimony."""
    year_range = inspector.year_range(options, archive)

    # Audit reports (1998 is the earliest published year).
    for year in year_range:
        if year < 1998:
            continue
        year_url = AUDIT_REPORTS_URL.format(year=year)
        page = utils.beautifulsoup_from_url(year_url)
        rows = page.select("tr")
        if not rows:
            raise inspector.NoReportsFoundError(
                "Pension Benefit Guaranty Corporation (audit reports)")
        for row in rows:
            report = report_from(row,
                                 report_type='audit',
                                 year_range=year_range)
            if report:
                inspector.save_report(report)

    # Congressional requests use the same table markup.
    page = utils.beautifulsoup_from_url(CONGRESSIONAL_REQUESTS_URL)
    rows = page.select("tr")
    if not rows:
        raise inspector.NoReportsFoundError(
            "Pension Benefit Guaranty Corporation (congressional requests)")
    for row in rows:
        report = report_from(row,
                             report_type='congress',
                             year_range=year_range)
        if report:
            inspector.save_report(report)

    # Semiannual reports are plain links.
    page = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    links = page.select("div.holder a")
    if not links:
        raise inspector.NoReportsFoundError(
            "Pension Benefit Guaranty Corporation (semiannual reports)")
    for link in links:
        report = semiannual_report_from(link, year_range)
        if report:
            inspector.save_report(report)

    # Congressional testimony mirrors the semiannual layout.
    page = utils.beautifulsoup_from_url(CONGRESSIONAL_TESTIMONY_URL)
    links = page.select("div.holder a")
    if not links:
        raise inspector.NoReportsFoundError(
            "Pension Benefit Guaranty Corporation (congressional testimony)")
    for link in links:
        report = testimony_report_from(link, year_range)
        if report:
            inspector.save_report(report)
Beispiel #16
0
def run(options):
    """Scrape NCUA audit reports per year, FOIA reports, and semiannual
    reports."""
    year_range = inspector.year_range(options, archive)

    # Audit reports (2002 is the oldest page).
    for year in year_range:
        if year < 2002:
            continue
        page = BeautifulSoup(utils.download(
            AUDIT_REPORTS_URL.format(year=year)))

        # if it's a 404 page (200 response code), move on
        if not_found(page):
            continue

        rows = page.select("div.content table tr")
        if not rows:
            raise inspector.NoReportsFoundError("NCUA (%d)" % year)
        for index, row in enumerate(rows):
            if index == 0:
                # Skip the header row
                continue
            report = report_from(row,
                                 report_type='audit',
                                 year_range=year_range)
            if report:
                inspector.save_report(report)

    # FOIA reports share the same table layout.
    page = BeautifulSoup(utils.download(FOIA_REPORTS_URL))
    rows = page.select("div.content table tr")
    if not rows:
        raise inspector.NoReportsFoundError("NCUA (FOIA)")
    for index, row in enumerate(rows):
        if index == 0:
            # Skip the header row
            continue
        report = report_from(row,
                             report_type='other',
                             year_range=year_range)
        if report:
            inspector.save_report(report)

    # Semiannual reports: plain links in the content div.
    page = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    links = page.select("div.content a")
    if not links:
        raise inspector.NoReportsFoundError("NCUA (semiannual reports)")
    for link in links:
        report = semiannual_report_from(link, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Scrape CPB reports: audits, peer reviews, plans, and semiannual
    reports to Congress.

    The pages are parsed positionally (rows[0], rows[1], specific
    columns), so this raises IndexError loudly if the site layout
    changes.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the reports
    doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)
    rows = doc.select("div.content > div > div > div > div > div.row")
    # Assumes the first row block holds the audit list — TODO confirm
    # against the live page if this ever raises IndexError.
    row_audits = rows[0]

    # Audit reports
    results = row_audits.select("ul li.pdf")
    if not results:
        raise inspector.NoReportsFoundError("CPB (audits)")
    for result in results:
        report = report_from(result, AUDIT_REPORTS_URL, "audit", year_range)
        if report:
            inspector.save_report(report)

    doc = utils.beautifulsoup_from_url(OTHER_REPORTS_URL)
    rows = doc.select("div.content > div > div > div > div.row")
    # Positional layout assumption: row 0 = peer reviews; row 1 holds two
    # columns, plans on the left and semiannual reports on the right.
    row_peer_review = rows[0]
    col_plans = rows[1].select("div.col-md-6")[0]
    col_congress = rows[1].select("div.col-md-6")[1]

    # Peer review
    results = row_peer_review.select("ul li.pdf")
    if not results:
        raise inspector.NoReportsFoundError("CPB (peer reviews)")
    for result in results:
        report = report_from(result, OTHER_REPORTS_URL, "other", year_range)
        if report:
            inspector.save_report(report)

    # Plans
    results = col_plans.select("ul li.pdf")
    if not results:
        raise inspector.NoReportsFoundError("CPB (plans)")
    for result in results:
        report = report_from(result, OTHER_REPORTS_URL, "other", year_range)
        if report:
            inspector.save_report(report)

    # Semiannual reports to congress
    results = col_congress.select("ul li.pdf")
    if not results:
        raise inspector.NoReportsFoundError("CPB (semiannual reports)")
    for result in results:
        report = report_from(result, OTHER_REPORTS_URL, "semiannual_report",
                             year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Scrape Treasury audit reports (organized by fiscal year), other
    report types, and semiannual reports."""
    year_range = inspector.year_range(options, archive)
    if datetime.datetime.now().month >= 10:
        # October, November, and December fall into the next fiscal year
        # Add next year to year_range to compensate
        year_range.append(max(year_range) + 1)

    # Pull the audit reports
    for year in year_range:
        if year < 2006:  # This is the oldest year for these reports
            continue
        url = AUDIT_REPORTS_BASE_URL.format(year)
        doc = utils.beautifulsoup_from_url(url)
        results = doc.find_all(
            "tr",
            class_=["ms-rteTableOddRow-default", "ms-rteTableEvenRow-default"])
        if not results:
            # The upcoming fiscal year's page may legitimately be empty,
            # so only fail for years that should already have reports.
            if year != datetime.datetime.now().year + 1:
                raise inspector.NoReportsFoundError("Treasury (%d)" % year)
        for result in results:
            report = audit_report_from(result, url, year_range)
            if report:
                inspector.save_report(report)

    # Pull the other report types, one listing page per type.
    for report_type, url in OTHER_URLS.items():
        doc = utils.beautifulsoup_from_url(url)
        results = doc.select(
            "#ctl00_PlaceHolderMain_ctl05_ctl01__ControlWrapper_RichHtmlField > p a"
        )
        if not results:
            raise inspector.NoReportsFoundError("Treasury (%s)" % report_type)
        for result in results:
            # If the link is the only anchor in its paragraph, hand the
            # whole paragraph to report_from for extra context.
            if len(result.parent.find_all("a")) == 1:
                result = result.parent
            report = report_from(result, url, report_type, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports.
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select(
        "#ctl00_PlaceHolderMain_ctl05_ctl01__ControlWrapper_RichHtmlField > p > a"
    )
    if not results:
        raise inspector.NoReportsFoundError("Treasury (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, SEMIANNUAL_REPORTS_URL,
                                        year_range)
        if report:
            inspector.save_report(report)
def run(options):
  """Scrape NRC audit reports (archived, prior/pending, and per-year
  pages), congressional testimony, and other report listings.

  Raises inspector.NoReportsFoundError when a page's report table has
  no rows.
  """
  year_range = inspector.year_range(options, archive)

  # Build the list of audit pages: two static pages plus one per year
  # from 2005 onward.
  urls = [ARCHIVED_REPORTS_URL, PRIOR_PENDING_REPORTS_URL]
  for year in year_range:
    if year >= 2005:
      urls.append(AUDIT_REPORTS_URL.format(year))

  # Pull the audit reports
  for url in urls:
    doc = utils.beautifulsoup_from_url(url)
    results = doc.find("table", border="1").select("tr")
    if not results:
      # Fixed: the message previously interpolated the stale loop
      # variable `year` (the last year of the previous loop, or a
      # NameError when year_range was empty) instead of identifying
      # the page that actually failed.
      raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (%s)" % url)
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the congressional testimony
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  semiannual_reports_table = doc.find("table", border="1")
  results = semiannual_reports_table.select("tr")
  if not results:
    raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (congressional testimony)")
  for index, result in enumerate(results):
    if index < 2:
      # Skip the first two header rows
      continue
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the other reports
  for reports_url, id_prefix in OTHER_REPORT_URLS:
    doc = utils.beautifulsoup_from_url(reports_url)
    results = doc.find("table", border="1").select("tr")
    if not results:
      raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (other)")
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = other_report_from(result, year_range, id_prefix, reports_url)
      if report:
        inspector.save_report(report)
Beispiel #20
0
def run(options):
    """Scrape NRC audit reports per year, congressional testimony, and
    other report listings."""
    year_range = inspector.year_range(options, archive)

    # Audit reports: one bordered table per year page.
    for year in year_range:
        url = AUDITS_REPORTS_URL.format(year)
        page = BeautifulSoup(utils.download(url))
        rows = page.find("table", border="1").select("tr")
        if not rows:
            raise inspector.NoReportsFoundError(
                "Nuclear Regulatory Commission (%d)" % year)
        for index, row in enumerate(rows):
            if index == 0:
                # Skip the header row
                continue
            report = audit_report_from(row, url, year_range)
            if report:
                inspector.save_report(report)

    # Congressional testimony / semiannual reports table.
    page = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    table = page.find("table", border="1")
    rows = table.select("tr")
    if not rows:
        raise inspector.NoReportsFoundError(
            "Nuclear Regulatory Commission (congressional testimony)")
    for index, row in enumerate(rows):
        if index < 2:
            # Skip the first two header rows
            continue
        report = semiannual_report_from(row, year_range)
        if report:
            inspector.save_report(report)

    # Other report listings.
    for reports_url, id_prefix in OTHER_REPORT_URLS:
        page = BeautifulSoup(utils.download(reports_url))
        rows = page.find("table", border="1").select("tr")
        if not rows:
            raise inspector.NoReportsFoundError(
                "Nuclear Regulatory Commission (other)")
        for index, row in enumerate(rows):
            if index == 0:
                # Skip the header row
                continue
            report = other_report_from(row, year_range, id_prefix)
            if report:
                inspector.save_report(report)
Beispiel #21
0
def run(options):
  """Scrape LSC reports, trying several markup variants for the links."""
  year_range = inspector.year_range(options, archive)

  for url, report_type in REPORT_URLS.items():
    page_content = utils.download(url)

    # This typo confuses BS4 and interferes with our selectors
    page_content = page_content.replace('<h4>2015</h3>', '<h4>2015</h4>')

    doc = BeautifulSoup(page_content)

    # The pages use several slightly different structures; try each
    # selector in turn and keep the first that matches.
    selectors = (
      "blockquote > ul > a",
      "blockquote > ul > li > a",
      "blockquote > font > ul > a",
      "blockquote > a",
    )
    results = []
    for selector in selectors:
      results = doc.select(selector)
      if results:
        break
    if not results:
      raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" % url)
    for result in results:
      report = report_from(result, url, report_type, year_range)
      if report:
        inspector.save_report(report)
Beispiel #22
0
def parse_investigation(content, landing_url, report_type, year_range):
    """Parse an investigations page whose DOJ-referral section is
    separated from the other reports by an <hr> tag.

    Before the <hr>, each <ul> holds one DOJ report; after it, any tag
    other than an <h3> heading with non-empty text is a report. Raises
    NoReportsFoundError unless at least one report was found on BOTH
    sides of the divider.
    """
    # True while we are still above the <hr> divider (DOJ section).
    doj_flag = True
    doj_report_counter = 0
    other_report_counter = 0
    for child in content.children:
        if isinstance(child, Tag) and child.name == 'hr':
            # The divider: everything after this is the "other" section.
            doj_flag = False
            continue
        if doj_flag:
            if isinstance(child, Tag) and child.name == 'ul':
                report = report_from(child.li, landing_url, report_type,
                                     year_range)
                if report:
                    inspector.save_report(report)
                    doj_report_counter = doj_report_counter + 1
        else:
            if isinstance(child, Tag):
                # Headings are section labels, not reports.
                if child.name != 'h3' and child.text.strip():
                    report = report_from(child, landing_url, report_type,
                                         year_range)
                    if report:
                        inspector.save_report(report)
                        other_report_counter = other_report_counter + 1
            elif isinstance(child, Comment):
                # Must be checked before NavigableString: Comment is a
                # subclass of NavigableString in bs4.
                continue
            elif isinstance(child, NavigableString):
                # Loose text between tags is unexpected; fail loudly so
                # markup changes are noticed.
                if child.strip():
                    raise Exception("Unexpected text!: " + child)
    if doj_report_counter == 0 or other_report_counter == 0:
        raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" %
                                            landing_url)
Beispiel #23
0
def run(options):
  """Scrape DHS OIG audit reports for the requested components.

  Honors the 'component', 'report_id', and 'limit' options. Reports that
  appear under multiple components (same report_id and title) are merged
  by concatenating their agency fields; everything is saved at the end.
  """
  year_range = inspector.year_range(options, archive)

  requested = options.get('component')
  components = [requested] if requested else list(COMPONENTS.keys())

  report_id = options.get('report_id')
  limit = int(options.get('limit', 0))

  collected = {}

  for component in components:
    logging.info("## Fetching reports for component %s" % component)
    url = url_for(options, component)
    doc = BeautifulSoup(utils.download(url))

    rows = doc.select("table.contentpaneopen table[border=1] tr")
    # accept only trs that look like body tr's (no 'align' attribute)
    #   note: HTML is very inconsistent. cannot rely on thead or tbody
    rows = [row for row in rows if row.get('align') is None]
    if not rows:
      raise inspector.NoReportsFoundError("DHS (%s)" % component)

    count = 0
    for row in rows:
      report = report_from(row, component, url)
      if not report:
        continue
      if report_id and (report_id != report['report_id']):
        continue
      if inspector.year_from(report) not in year_range:
        continue

      key = (report["report_id"], report["title"])
      if key in collected:
        # Same report seen under another component: merge agency info.
        existing = collected[key]
        existing["agency"] += ", " + report["agency"]
        existing["agency_name"] += ", " + report["agency_name"]
      else:
        collected[key] = report

      count += 1
      if limit and (count >= limit):
        break

    logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

  for report in collected.values():
    inspector.save_report(report)
Beispiel #24
0
def parse_mapping(content, landing_url, report_type, year_range):
    """Scrape reports from the LSC OIG mapping-project page.

    Each known link pattern maps to the element that report_from should
    parse; unknown links raise so layout changes are noticed immediately.
    Raises inspector.NoReportsFoundError when the page has no links at all.
    """
    links = content.find_all("a")
    if not links:
        raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" %
                                            landing_url)
    for link in links:
        href = urljoin(landing_url, link.get("href"))

        # Links we deliberately skip.
        if href == "https://www.oig.lsc.gov/images/mapping/mapping.zip":
            continue
        if href == MAPPING_PROJECT_ARCHIVE_GRANTEE_URL:
            continue
        if href.startswith("mailto:"):
            continue

        if href == "https://www.oig.lsc.gov/evaluation-of-legal-services-mapping-prsentation":
            # Known-broken link; point it at the working PDF first.
            link["href"] = "https://oig.lsc.gov/mapping/phaseIIbriefing.pdf"
            result = link.parent
        elif href in (
                "https://www.oig.lsc.gov/images/pdfs/mapping/MeekerOIGMappingReport.pdf",
                "https://www.oig.lsc.gov/core-legal-services",
        ):
            result = link.parent
        elif href == "https://www.oig.lsc.gov/images/mapping/Mapping_Evaluation_Phase_I_Volume_I_Final_Report.pdf":
            result = link.parent.parent
        elif (href.startswith("https://oig.lsc.gov/mapping/references/eval")
              and href.endswith(".pdf")):
            result = link
        else:
            raise Exception(
                "Unexpected link found on a mapping project page: %s" % href)

        report = report_from(result, landing_url, report_type, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Scrape Department of the Interior OIG reports, page by page.

    Walks the paginated search results, discovering the final page number
    from the "Go to last page" link as it goes. Raises if a page has no
    results or if the last-page link was never found.
    """
    year_range = inspector.year_range(options, archive)
    earliest_year = min(year_range)
    page_number = 0
    final_page = 0

    while page_number <= final_page:
        doc = utils.beautifulsoup_from_url(
            REPORT_SEARCH_URL.format(earliest_year, page_number))

        # Update our notion of the last page from the pager link, if present.
        pager_link = doc.find("a", title="Go to last page")
        if pager_link:
            match = re.search("[?&]page=([0-9]+)(?:&|$)", pager_link["href"])
            if match:
                final_page = int(match.group(1))

        rows = doc.select(".view-reports-advanced-search .views-row")
        if not rows:
            raise inspector.NoReportsFoundError("Department of the Interior")
        for row in rows:
            report = report_from(row, year_range)
            if report:
                inspector.save_report(report)
        page_number += 1

    # Never seeing the pager link suggests the page layout changed.
    if final_page == 0:
        raise Exception("Did not find last page link")
Beispiel #26
0
def run(options):
  """Scrape DOT OIG reports for the requested topics.

  Reports appearing under several topics are kept once, with the topic
  names concatenated; everything collected is saved at the end.
  """
  year_range = inspector.year_range(options, archive)

  requested = options.get('topics')
  topics = requested.split(",") if requested else TOPIC_TO_URL.keys()

  all_reports = {}

  for topic in topics:
    for year_url in urls_for(year_range, topic):
      logging.debug("Scraping %s" % year_url)
      doc = BeautifulSoup(utils.download(year_url))

      if not doc.select(".view-business-areas"):
        raise inspector.NoReportsFoundError("DOT (%s)" % topic)

      for row in doc.select(".view-business-areas .views-row"):
        report = report_from(row, year_range, topic, options)
        if not report:
          continue
        report_id = report["report_id"]
        if report_id in all_reports:
          # Already seen under another topic: append this topic's name.
          all_reports[report_id]["topic"] += ", " + topic
        else:
          all_reports[report_id] = report

  for report in all_reports.values():
    inspector.save_report(report)
Beispiel #27
0
def get_content(url):
    """Download *url* and return its ".content-left" elements.

    Raises inspector.NoReportsFoundError when the page has no such
    element (treated as "no reports found" for DOJ pages).
    """
    doc = BeautifulSoup(utils.download(url))
    selection = doc.select(".content-left")
    if not selection:
        raise inspector.NoReportsFoundError("DOJ (%s)" % url)
    return selection
Beispiel #28
0
def extract_reports_for_subtopic(subtopic_url, year_range, topic_name,
                                 subtopic_name):
    """Scrape all HHS OIG reports listed on one subtopic page.

    Tries several known page layouts in order until one yields results;
    cross-reference and "related" entries are skipped. Raises when the
    page cannot be fetched or yields no reports.
    """
    doc = beautifulsoup_from_url(subtopic_url)
    if not doc:
        raise Exception("Failure fetching subtopic URL: %s" % subtopic_url)

    results = None

    # This URL is different than the rest and needs to find the "p > a"s first.
    if subtopic_url == TOPIC_TO_URL['TMPC']:
        results = doc.select("#leftContentInterior > p > a")

    # Fall through the known layouts until one of them matches.
    for selector in ("#leftContentInterior dl dd",
                     "#leftContentInterior ul li",
                     "#leftContentInterior > p > a"):
        if results:
            break
        results = doc.select(selector)

    if not results:
        raise inspector.NoReportsFoundError("HHS (%s)" % subtopic_name)

    for result in results:
        container = result.parent.parent
        # Skip cross-references and the "related" sidebar section.
        if 'crossref' in container.attrs.get('class', []):
            continue
        if container.attrs.get('id') == 'related':
            continue
        report = report_from(result, year_range, topic_name, subtopic_url,
                             subtopic_name)
        if report:
            deduplicate_save_report(report)
Beispiel #29
0
    def run(self, options):
        """Scrape Department of Energy OIG reports for the year range.

        Stores the options and derived date bounds on the instance, then
        walks every listing URL, trying progressively looser node
        selectors until one matches. Raises when a page has no nodes.
        """
        self.options = options
        self.year_range = inspector.year_range(self.options, archive)
        # Inclusive date bounds covering the whole requested year range.
        self.first_date = datetime.datetime(self.year_range[0], 1, 1)
        self.last_date = datetime.datetime(self.year_range[-1], 12, 31)

        for url in self.urls_for():
            page = utils.beautifulsoup_from_url(url)

            # Selector fallbacks: newest layout first, generic last.
            nodes = (page.select('.energy-listing__results .node')
                     or page.select('.field-items .node')
                     or page.select('.node'))
            if not nodes:
                raise inspector.NoReportsFoundError(
                    "Department of Energy (%s)" % url)

            for node in nodes:
                report = self.report_from(node)
                # A falsy report means out of the date range or wrong ID.
                if report:
                    inspector.save_report(report)
Beispiel #30
0
def run(options):
  """Scrape FLRA reports, deduplicating by (report_id, url).

  Each report page is parsed as a list ("ul li") when possible, falling
  back to paragraphs; raises when neither layout yields anything.
  """
  year_range = inspector.year_range(options, archive)
  seen = set()

  def save_once(report, key):
    # Save each (report_id, url) pair only the first time it appears.
    if key not in seen:
      inspector.save_report(report)
      seen.add(key)

  for report_type, url in REPORT_URLS:
    doc = utils.beautifulsoup_from_url(url)
    list_items = doc.select("section#content ul li")
    if list_items:
      for item in list_items:
        report = report_from_list(item, url, report_type, year_range)
        if report:
          # Unquote the URL for the key only when it is truthy.
          if report["url"]:
            save_once(report, (report["report_id"], unquote(report["url"])))
          else:
            save_once(report, (report["report_id"], report["url"]))
    else:
      paragraphs = doc.select("section#content p")
      if not paragraphs:
        raise inspector.NoReportsFoundError("Federal Labor Relations Authority (%s)" % report_type)
      for paragraph in paragraphs:
        report = report_from_paragraph(paragraph, url, report_type, year_range)
        if report:
          save_once(report, (report["report_id"], report["url"]))