Example #1
def run(options):
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # Pull the audit reports. Pages are 0-indexed.
  for page in range(0, int(pages) - 1):
    doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(page=page)))
    results = doc.select("span.field-content")
    if not results:
      # No more results, we must have hit the last page
      break

    for result in results:
      report = report_from(result, year_range, report_type='audit')
      if report:
        inspector.save_report(report)

  # Grab the other reports
  for report_type, url in OTHER_REPORT_URLS.items():
    doc = BeautifulSoup(utils.download(url))
    results = doc.select(".views-field")
    if not results:
      results = doc.select(".views-row")
    for result in results:
      report = report_from(result, year_range, report_type)
      if report:
        inspector.save_report(report)
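Every example in this listing hands each result node to a report_from-style helper that is not shown. As a purely illustrative sketch — the selectors, date format, and agency fields here are assumptions, not the project's actual helper — such a function could look like the following, producing the same report dict shape that the semiannual_report_from examples further down build explicitly:

import datetime
from urllib.parse import urljoin

def report_from(result, year_range, report_type):
  # Hypothetical: pull the link, title, and date out of one result node.
  link = result.find("a")
  if not link:
    return None
  title = link.text.strip()
  report_url = urljoin(AUDIT_REPORTS_URL, link.get("href"))  # base URL assumed in scope
  report_id = report_url.split("/")[-1].rsplit(".", 1)[0]

  # The date selector and format are guesses; real listings vary per site.
  published_on_text = result.select("span.date")[0].text.strip()
  published_on = datetime.datetime.strptime(published_on_text, "%B %d, %Y")
  if published_on.year not in year_range:
    return None

  return {
    'inspector': "example",  # IG office slug (assumed)
    'inspector_url': "https://oig.example.gov",
    'agency': "example",
    'agency_name': "Example Agency",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }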
Example #2
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2006:  # The oldest year for audit reports
      continue
    url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div#content li")
    for result in results:
      report = audit_report_from(result, url, year, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div#content li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the Peer Review
  doc = BeautifulSoup(utils.download(PEER_REVIEWS_URL))
  result = doc.find("div", id='content').find("a", text=True)
  report = peer_review_from(result, year_range)
  inspector.save_report(report)
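inspector.year_range(options, archive) appears in nearly every run() in this listing but is not itself shown. A rough sketch of the behavior the call sites rely on — the option names and defaults are assumptions — is a list of years bounded by the scraper's archive start and the current year:

import datetime

def year_range(options, archive):
  # Assumed: scrape from --since (or --year) through the current year,
  # never earlier than the archive start year.
  this_year = datetime.datetime.now().year
  since = int(options.get('since', options.get('year', this_year)))
  since = max(since, archive)
  return list(range(since, this_year + 1))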
Example #3
  def urls_for_topics(self, topics):
    for topic in topics:
      # Topic might be a tuple for ADDITIONAL_TOPICS (not ones from command
      # line).
      self.report_type = None
      if isinstance(topic, tuple):
        topic, report_type = topic
        self.report_type = report_type

      last_page = False

      url = TOPIC_TO_URL[topic]
      page = BeautifulSoup(utils.download(url))
      page_started = self.is_first_page(page)
      if page_started:
        yield url

      for link in page.select('li.pager-item a'):
        next_url = urljoin(url, link['href'])
        next_page = BeautifulSoup(utils.download(next_url))
        if not page_started:
          page_started = self.is_first_page(next_page)
        if page_started:
          yield next_url
        last_page = self.is_last_page(next_page)
        if last_page:
          break
      if last_page:
        continue
    self.report_type = None  # Clear this out afterwards
Example #4
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    url = AUDITS_REPORTS_URL.format(str(year)[2:4])
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("tr")
    if not results:
      raise inspector.NoReportsFoundError("NASA (%d)" % year)
    for index, result in enumerate(results):
      if not index or not result.text.strip():
        # Skip the header row and any empty rows
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
  results = doc.select("#subContainer ul li")
  if not results:
    raise inspector.NoReportsFoundError("NASA (other)")
  for result in results:
    report = other_report_from(result, year_range)
    if report:
      inspector.save_report(report)
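Several scrapers raise inspector.NoReportsFoundError when a selector unexpectedly comes back empty, so a silent site redesign fails loudly instead of saving nothing. Assuming the project does not already define it with extra behavior, a minimal definition would be:

class NoReportsFoundError(Exception):
  """Raised when a page that should list reports yields no parseable results."""
  pass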
Example #5
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the RSS feed
  doc = BeautifulSoup(utils.download(RSS_URL))
  results = doc.select("item")
  for result in results:
    report = rss_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the recent audit reports.
  doc = BeautifulSoup(utils.download(RECENT_AUDITS_URL))
  results = doc.select("div.block > a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the archive audit reports
  doc = BeautifulSoup(utils.download(AUDIT_ARCHIVE_URL))
  results = doc.select("div.block a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORTS_URl))
  results = doc.select("div.block > a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example #6
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2005:  # This is the earliest audits go back
      continue
    url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div.content")
    if not results:
      raise inspector.NoReportsFoundError("Tennessee Valley Authority (%d)" % year)
    for result in results:
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("report")
  if not results:
    raise inspector.NoReportsFoundError("Tennessee Valley Authority (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example #7
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS.items():
    for page in range(0, 999):
      url = report_url_format.format(page=page)
      doc = BeautifulSoup(utils.download(url))
      results = doc.select("li.views-row")
      if not results:
        if page == 0:
          raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
        else:
          break

      for result in results:
        report = report_from(result, url, report_type, year_range)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports (no pagination)
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("li.views-row")
  if not results:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example #8
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the general reports
  doc = BeautifulSoup(utils.download(REPORTS_URL))
  results = doc.select("div#mainContent li.mainContenttext a")
  for result in results:
    report = report_from(result, REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the archive reports
  doc = BeautifulSoup(utils.download(REPORT_ARCHIVE_URL))
  results = doc.select("div#mainContent li.mainContenttext a") + doc.select("div#mainContent span.mainContenttext a")
  for result in results:
    if not result.text:
      continue
    report = report_from(result, REPORT_ARCHIVE_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div#mainContent li.mainContenttext a")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example #9
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        if year < 2002:  # The oldest page for audit reports
            continue
        doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(year=year)))
        results = doc.select("div.content table tr")
        for index, result in enumerate(results):
            if not index:
                # Skip the header row
                continue
            report = report_from(result, report_type="audit", year_range=year_range)
            if report:
                inspector.save_report(report)

    # Pull the FOIA reports
    doc = BeautifulSoup(utils.download(FOIA_REPORTS_URL))
    results = doc.select("div.content table tr")
    for index, result in enumerate(results):
        if not index:
            # Skip the header row
            continue
        report = report_from(result, report_type="other", year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    results = doc.select("div.content a")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
Example #10
def run(options):
  year_range = inspector.year_range(options)

  # Pull the audit reports
  for year in year_range:
    url = audit_report_url(year)
    if url:
      parse_result_from_js_url(url, "auditreports", year, year_range)
    url = inspection_report_url(year)
    if url:
      parse_result_from_js_url(url, "iereports", year, year_range)

  # Pull the congressional testimony
  doc = BeautifulSoup(utils.download(CONGRESSIONAL_TESTIMONY_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = congressional_testimony_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example #11
  def urls_for(self):
    only = self.options.get('topics')
    if only: # if only...
      only = set(only.split(','))
      only = [(o, TOPIC_TO_REPORT_TYPE[o]) if o in TOPIC_TO_REPORT_TYPE else o
              for o in only]
      yield from self.urls_for_topics(only)
      # If there are topics selected, ONLY yield URLs for those.
      return

    # First yield the URLs for the topics that are tangential to the main
    # Calendar Year reports.
    yield from self.urls_for_topics(ADDITIONAL_TOPICS)

    # Not getting reports from specific topics, iterate over all Calendar Year
    # reports.
    page = BeautifulSoup(utils.download(BASE_URL))

    # Iterate over each "Calendar Year XXXX" link
    for li in page.select('.field-items li'):
      md = RE_CALENDAR_YEAR.search(li.text)
      if md:
        cur_year = int(md.group(1))
        if cur_year >= self.year_range[0] and cur_year <= self.year_range[-1]:
          href = li.select('a')[0]['href']
          next_url = urljoin(BASE_URL, href)
          # The first page of reports is yielded.
          yield next_url

          # Next, read all the pagination links for the page and yield those. So
          # far, I haven't seen a page that doesn't have all of the following
          # pages enumerated.
          next_page = BeautifulSoup(utils.download(next_url))
          for link in next_page.select('li.pager-item a'):
            yield urljoin(BASE_URL, link['href'])
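RE_CALENDAR_YEAR is not shown; since the code above takes a four-digit year from group(1) of each "Calendar Year XXXX" link, a plausible (assumed) definition is:

import re

RE_CALENDAR_YEAR = re.compile(r"Calendar Year\s+(\d{4})")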
Example #12
def run(options):
  year_range = inspector.year_range(options, archive)

  doc = BeautifulSoup(utils.download(REPORTS_URL))

  # Pull the semiannual reports
  semiannual_results = doc.select("#AnnualManagementReports select")[0]
  for result in semiannual_results.select("option"):
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the special reports
  special_report_table = doc.find("table", attrs={"bordercolor": "#808080"})
  for index, result in enumerate(special_report_table.select("tr")):
    if not index:
      # Skip the header row
      continue
    report = report_from(result, REPORTS_URL, report_type='other', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the audit reports
  for year in year_range:
    if year < 2001:  # The oldest fiscal year page available
      continue
    year_url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(year_url))
    for index, result in enumerate(doc.select("#main table tr")):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, year_url, report_type='audit', year_range=year_range)
      if report:
        inspector.save_report(report)
Example #13
def extract_reports_for_oei(year_range):
  topic_name = TOPIC_NAMES["OE"]
  topic_url = TOPIC_TO_URL["OE"]
  root_body = utils.download(topic_url)
  root_doc = BeautifulSoup(root_body)

  letter_urls = set()
  for link in root_doc.select("#leftContentInterior li a"):
    absolute_url = urljoin(topic_url, link['href'])
    absolute_url = strip_url_fragment(absolute_url)
    letter_urls.add(absolute_url)

  if not letter_urls:
    raise inspector.NoReportsFoundError("HHS (OEI first pass)")

  all_results_links = {}
  all_results_unreleased = []
  for letter_url in letter_urls:
    letter_body = utils.download(letter_url)
    letter_doc = BeautifulSoup(letter_body)

    results = letter_doc.select("#leftContentInterior ul li")
    if not results:
      raise inspector.NoReportsFoundError("HHS (OEI %s)" % letter_url)
    for result in results:
      if 'crossref' in result.parent.parent.attrs.get('class', []):
        continue
      if result.parent.parent.attrs.get('id') == 'related':
        continue

      node = result
      while node and node.name != "h2":
        node = node.previous
      if node and node.name == "h2":
        subtopic_name = str(node.text)
      else:
        subtopic_name = "(unknown)"

      links = result.findAll("a")
      if len(links) == 0:
        result.extract()
        all_results_unreleased.append([result, subtopic_name])
      else:
        url = links[0].get("href")
        if url not in all_results_links:
          result.extract()
          all_results_links[url] = [result, subtopic_name]
        else:
          existing_result = all_results_links[url][0]
          for temp in result.contents:
            temp.extract()
            existing_result.append(temp)
          all_results_links[url][1] = "%s, %s" % (all_results_links[url][1], subtopic_name)

  subtopic_url = TOPIC_TO_URL["OE"]
  for result, subtopic_name in itertools.chain(all_results_links.values(), all_results_unreleased):
    report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name)
    if report:
      inspector.save_report(report)
Example #14
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (audit reports)")
  for result in results:
    # ignore divider lines
    if result.select("img"): continue

    report = report_from(result, report_type='audit', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (semiannual reports)")
  for result in results:
    if not result.text.strip():
      continue
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the case reports
  response = utils.scraper.post(
    url=CASE_REPORTS_URL,
    data=CASE_REPORTS_DATA,
  )
  doc = BeautifulSoup(response.content)
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (case reports)")
  for index, result in enumerate(results):
    if not index or not result.text.strip():  # Skip the header row and empty rows
      continue
    report = case_report_from(result, CASE_REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the testimony
  doc = BeautifulSoup(utils.download(TESTIMONY_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (testimony)")
  for result in results:
    if not result.text.strip():
      continue
    report = report_from(result, report_type='testimony', year_range=year_range)
    if report:
      inspector.save_report(report)
Example #15
def handle_scanner_args(args, opts) -> Tuple[dict, list]:
    """
    --analytics: file path or URL to a CSV of participating domains.

    This function also handles checking for the existence of the file,
    downloading it successfully, and reading the file in order to populate the
    list of analytics domains.
    """
    parser = scan_utils.ArgumentParser(prefix_chars="--")
    parser.add_argument("--analytics", nargs=1, required=True)
    parsed, unknown = parser.parse_known_args(args)
    dicted = vars(parsed)
    should_be_single = ["analytics"]
    dicted = scan_utils.make_values_single(dicted, should_be_single)
    resource = dicted.get("analytics")
    if not resource.endswith(".csv"):
        no_csv = "".join([
            "--analytics should be the file path or URL to a CSV of participating",
            " domains and end with .csv, which '%s' does not" % resource
        ])
        logging.error(no_csv)
        raise argparse.ArgumentTypeError(no_csv)
    try:
        parsed_url = urlparse(resource)
    except:
        raise
    if parsed_url.scheme and parsed_url.scheme in ("http", "https"):
        analytics_path = Path(opts["_"]["cache_dir"], "analytics.csv").resolve()
        try:
            utils.download(resource, str(analytics_path))
        except:
            logging.error(utils.format_last_exception())
            no_csv = "--analytics URL %s not downloaded successfully." % resource
            logging.error(no_csv)
            raise argparse.ArgumentTypeError(no_csv)
    else:
        if not os.path.exists(resource):
            no_csv = "--analytics file %s not found." % resource
            logging.error(no_csv)
            raise FileNotFoundError(no_csv)
        else:
            analytics_path = resource

    analytics_domains = utils.load_domains(analytics_path)
    dicted["analytics_domains"] = analytics_domains
    del dicted["analytics"]

    return (dicted, unknown)
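A hedged usage sketch of the argument handler above; the CSV URL and the shape of opts are assumptions based only on how the function reads opts["_"]["cache_dir"]:

# Hypothetical invocation by a scan runner:
extra_args = ["--analytics", "https://example.gov/analytics-domains.csv"]
opts = {"_": {"cache_dir": "./cache"}}
scan_options, remaining = handle_scanner_args(extra_args, opts)
# scan_options["analytics_domains"] now holds the domains loaded from the cached CSV.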
Example #16
def run(options):
  year_range = inspector.year_range(options)
  only_id = options.get('report_id')

  print("## Downloading reports from %i to %i" % (year_range[0], year_range[-1]))

  url = url_for()
  body = utils.download(url)

  doc = BeautifulSoup(body)
  results = doc.select("section")

  for result in results:
    try:
      year = int(result.get("title"))
      # check that the fetched year is in the range
      if year not in year_range:
        continue
      print("## Downloading year %i " % year)
    except ValueError:
      continue

    # gets each table entry and generates a report from it
    listings = result.div.table.tbody.contents
    for item in listings:
      if type(item) is not bs4.element.Tag:
        continue
      report = report_from(item)

      # can limit it to just one report, for debugging convenience
      if only_id and only_id != report['report_id']:
        continue

      inspector.save_report(report)
Example #17
  def fetch_from_landing_page(self, landing_url):
    """Returns a tuple of (pdf_link, summary_text, is_unreleased)."""
    unreleased = False
    page = BeautifulSoup(utils.download(landing_url))

    summary = None
    field_items = page.select('.field-items')
    if field_items:
      text = [node.strip() for node in field_items[0].findAll(text=True)]
      summary = '\n\n'.join(text).strip()
    if not summary:
      logging.info('\tno summary text found')

    if (summary and (RE_NOT_AVAILABLE.search(summary)
                     or RE_NOT_AVAILABLE_2.search(summary)
                     or RE_NOT_AVAILABLE_3.search(summary)
                     or RE_NOT_AVAILABLE_4.search(summary)
                     or RE_CLASSIFIED.search(summary))):
      unreleased = True

    report_url = None
    pdf_link = page.select('.file a')
    if not pdf_link:
      logging.warn('No pdf link found on page: {0}'.format(landing_url))
    else:
      report_url = pdf_link[0]['href']

    return report_url, summary, unreleased
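The RE_NOT_AVAILABLE*/RE_CLASSIFIED patterns that flag a report as unreleased are not shown. Purely illustrative definitions, consistent only with how they are searched against the summary text — the real phrasings may differ:

import re

RE_NOT_AVAILABLE = re.compile("not available", re.I)
RE_NOT_AVAILABLE_2 = re.compile("could not be made available", re.I)
RE_NOT_AVAILABLE_3 = re.compile("no longer available", re.I)
RE_NOT_AVAILABLE_4 = re.compile("for official use only", re.I)
RE_CLASSIFIED = re.compile("classified", re.I)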
Example #18
def run(options):
  year_range = inspector.year_range(options, archive)
  report_flag = False

  # Pull the table of reports for each year
  for year in year_range:
    url = url_for_year(year)
    html = utils.download(url, scraper_slug="osc")

    if html is None:
      if year == max(year_range):
        continue
      else:
        raise Exception("Couldn't fetch reports page {}".format(url))

    #  spaces appear as &#160; and \u200b .... fix that now
    html = html.replace('&#160;', ' ').replace('\u200b', ' ').replace('\u00a0', ' ').replace('\r', '').replace('\n', '')
    doc = BeautifulSoup(html, "lxml")

    OUTCOME_CODES = generate_outcome_codes(doc)

  keys_used = []  # a few reports appear multiple times... skip duplicates after the first occurrence

    results = doc.findAll("table")[1].tbody.findAll('tr')  # no ids on the tables, but it's the second one
    for result in results:
      reports = report_from(result, year, year_range, url, OUTCOME_CODES)
      for report in reports:
        if report['report_id'] not in keys_used:
          inspector.save_report(report)
          keys_used.append(report['report_id'])
          report_flag = True

  if not report_flag:
    raise inspector.NoReportsFoundError("OSC")
Example #19
def run(options):
  year_range = inspector.year_range(options)
  topics = options.get('topics')
  if topics:
    topics = topics.split(",")
  else:
    topics = TOPIC_TO_URL.keys()

  for topic in topics:
    topic_url = TOPIC_TO_URL[topic]
    body = utils.download(topic_url)
    doc = BeautifulSoup(body)

    try:
      year_results = doc.select("#Listing")[0]
      results = [x for x in year_results.select("ul li ul li")]
    except IndexError:
      try:
        all_results = doc.select("#bodyholder")[0]
        results = [x for x in all_results.select("ul li")]
      except IndexError:
        results = doc.select("table ul li")

    # Sometimes multiple reports are listed under the same datetime element.
    # We store which published datetime we saw last so that the next report
    # can use it if we are unable to find another published time.
    last_published_on = None
    for result in results:
      report, last_published_on = report_from(result, topic_url, topic, year_range, last_published_on)
      if report:
        inspector.save_report(report)
Example #20
def run(options):
  year_range = inspector.year_range(options)

  for page_url in URLS:
    done = False
    body = utils.download(page_url)
    doc = BeautifulSoup(body)

    maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
    all_p = maincontent.find_all("p")

    for p in all_p:
      for all_text, link_text, link_url in recurse_tree(p, False):
        if link_url is None:
          continue
        if link_url.startswith("mailto:"):
          continue
        if page_url == WHATS_NEW_URL and link_url == "/oig/whats-new-archive.cfm":
          # end of page
          done = True
          break
        if link_url.startswith("https://public.govdelivery.com/"):
          continue
        for index_url in URLS:
          if index_url.find(link_url) != -1:
            continue

        year = DATE_RE.search(all_text).group(3)
        if int(year) not in year_range:
          continue

        report = report_from(all_text, link_text, link_url, page_url)
        inspector.save_report(report)
      if done: break
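DATE_RE is not shown; since group(3) of the match is used as the four-digit year, a plausible (assumed) definition is a US-style date pattern:

import re

DATE_RE = re.compile(r"(\d{1,2})/(\d{1,2})/(\d{4})")  # e.g. "7/15/2014"; group(3) is the year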
Example #21
def run(options):
  year_range = inspector.year_range(options, archive)

  component = options.get('component')
  if component:
    components = [component]
  else:
    components = list(COMPONENTS.keys())

  report_id = options.get('report_id')

  limit = int(options.get('limit', 0))

  all_audit_reports = {}

  for component in components:
    logging.info("## Fetching reports for component %s" % component)
    url = url_for(options, component)
    body = utils.download(url)

    doc = BeautifulSoup(body)

    results = doc.select("table.contentpaneopen table[border=1] tr")
    # accept only trs that look like body tr's (no 'align' attribute)
    #   note: HTML is very inconsistent. cannot rely on thead or tbody
    results = [x for x in results if x.get('align') is None]
    if not results:
      raise inspector.NoReportsFoundError("DHS (%s)" % component)

    count = 0
    for result in results:
      report = report_from(result, component, url)
      if not report:
        continue

      if report_id and (report_id != report['report_id']):
        continue

      if inspector.year_from(report) not in year_range:
        # logging.info("[%s] Skipping, not in requested range." % report['report_id'])
        continue

      key = (report["report_id"], report["title"])
      if key in all_audit_reports:
        all_audit_reports[key]["agency"] = all_audit_reports[key]["agency"] + \
                ", " + report["agency"]
        all_audit_reports[key]["agency_name"] = \
                all_audit_reports[key]["agency_name"] + ", " + \
                report["agency_name"]
      else:
        all_audit_reports[key] = report

      count += 1
      if limit and (count >= limit):
        break

    logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

  for report in all_audit_reports.values():
    inspector.save_report(report)
Example #22
def run(options):
  year_range = inspector.year_range(options)

  doc = BeautifulSoup(utils.download(REPORTS_URL))

  # Pull the audit reports
  audit_header = doc.find("a", attrs={"name": 'Audit Reports'})
  audit_list1 = audit_header.find_next("ul").select("li")
  # They have two separate uls for these reports. See note to the IG web team.
  audit_list2 = audit_header.find_next("ul").find_next("ul").select("li")
  results = audit_list1 + audit_list2

  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the inspection reports
  inspections_header = doc.find("a", attrs={"name": 'Inspection Reports'})
  results = inspections_header.find_next("ul").select("li")

  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  semiannual_header = doc.find("a", attrs={"name": 'Semiannual Reports'})
  results = semiannual_header.find_next("ul").select("li")

  for result in results:
    report = report_from(result, year_range, title_prefix="Semiannual Report - ")
    if report:
      inspector.save_report(report)
Example #23
def run(options):
    year_range = inspector.year_range(options, archive)

    # Find the number of pages to iterate
    doc = BeautifulSoup(utils.download(REPORTS_URL))
    page_count_text = doc.select("div.AspNet-GridView-Pagination")[0].text
    page_count = int(re.search(r"Page 1 of (\d+)", page_count_text).groups()[0])

    # Iterate over those pages
    for page in range(1, page_count + 1):
        response = utils.scraper.post(
            REPORTS_URL,
            data={
                "__EVENTTARGET": "ctl00$ctl00$MainContent$NavTreeSubContent$sv$GridViewSummary",
                "__EVENTARGUMENT": "Page${page_number}".format(page_number=page),
            },
            cookies=COOKIES,
        )
        doc = BeautifulSoup(response.content)
        results = doc.select("div.AspNet-GridView table tr")
        if not results:
            break
        for index, result in enumerate(results):
            if not index:
                # Skip the header row
                continue
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)
Example #24
def run(options):
  year_range = inspector.year_range(options)
  pages = options.get('pages', ALL_PAGES)

  max_page = None
  for page in range(1, (int(pages) + 1)):
    if max_page and (page > max_page):
      print("End of pages!")
      break

    print("## Downloading page %i" % page)
    url = url_for(options, page)
    body = utils.download(url)
    doc = BeautifulSoup(body)
    max_page = last_page_for(doc)

    results = doc.select(".views-row")

    for result in results:
      report = report_from(result)

      # inefficient enforcement of --year arg, USPS doesn't support it server-side
      # TODO: change to published_on.year once it's a datetime
      if inspector.year_from(report) not in year_range:
        print("[%s] Skipping report, not in requested range." % report['report_id'])
        continue

      inspector.save_report(report)
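last_page_for(doc) is not shown. A hedged sketch, assuming a Drupal-style pager whose "last" link carries a page query parameter — the selector and parameter name are guesses:

import re

def last_page_for(doc):
  last_link = doc.select("li.pager-last a")
  if not last_link:
    return None  # no pager found; the callers above treat None as "keep paging"
  match = re.search(r"page=(\d+)", last_link[0]["href"])
  return int(match.group(1)) if match else None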
Example #25
def run(options):
  year_range = inspector.year_range(options)
  max_pages = int(options.get('pages', 1))
  for year in year_range:
    page = 1
    done = False
    while not done:
      url = url_for(options, page, year)
      body = utils.download(url)

      doc = BeautifulSoup(body)

      next_page = page + 1
      found_next_page = False
      page_links = doc.select("li.pager-item a.active")
      for page_link in page_links:
        if page_link.text == str(next_page):
          found_next_page = True
          break
      if not found_next_page:
        done = True
      if next_page > max_pages:
        done = True

      results = doc.select("table.views-table > tbody > tr")
      for result in results:
        report = report_from(result)
        inspector.save_report(report)

      page = next_page
      if not done:
        print('Moving to next page (%d)' % page)
Example #26
def run(options):
  year_range = inspector.year_range(options, archive)

  topics = options.get('topics')
  if topics:
    topics = topics.split(",")
  else:
    topics = TOPIC_TO_URL.keys()

  all_reports = {}

  for topic in topics:
    year_urls = urls_for(year_range, topic)
    for year_url in year_urls:
      logging.debug("Scraping %s" % year_url)
      body = utils.download(year_url)

      doc = BeautifulSoup(body)

      if not doc.select(".view-business-areas"):
        raise inspector.NoReportsFoundError("DOT (%s)" % topic)

      results = doc.select(".view-business-areas .views-row")
      for result in results:
        report = report_from(result, year_range, topic, options)
        if report:
          report_id = report["report_id"]
          if report_id in all_reports:
            all_reports[report_id]["topic"] = all_reports[report_id]["topic"] \
                + ", " + topic
          else:
            all_reports[report_id] = report

  for report in all_reports.values():
    inspector.save_report(report)
Example #27
def fetch_from_landing_page(landing_url):
  """Returns a tuple of (pdf_link, summary_text)."""
  add_pdf = False

  body = utils.download(landing_url)
  page = BeautifulSoup(body)
  link = page.find('a', text=RE_PDF_LINK_TEXT, href=RE_PDF_HREF)
  if not link:
    link = page.find('a', text=RE_PDF_CLICK_TEXT, href=RE_PDF_HREF)
  if not link:
    link = page.find('a', text=RE_PDF_SARC_TEXT, href=RE_PDF_HREF)

  # cases where .pdf is left off, ugh, e.g.
  # http://www.dodig.mil/pubs/report_summary.cfm?id=849
  if not link:
    link = page.find('a', text=RE_PDF_LINK_TEXT, href=RE_BACKUP_PDF_HREF)
    add_pdf = True


  href = link['href'].strip() if link else None
  if href and add_pdf:
    href = href + ".pdf"

  summary = None
  text_tr = page.select('tr[valign="top"] td')
  if text_tr:
    text = [node.strip() for node in text_tr[0].findAll(text=True)]
    summary = '\n\n'.join(text)
  if not summary:
    logging.info('\tno summary text found')

  return (href, summary)
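The RE_PDF_* patterns above are not shown. Assumed, illustrative definitions consistent with how they are used to locate the report link — the real link text and href shapes may differ:

import re

RE_PDF_LINK_TEXT = re.compile("Complete Report", re.I)   # assumed link text
RE_PDF_CLICK_TEXT = re.compile("click here", re.I)        # assumed
RE_PDF_SARC_TEXT = re.compile("Semiannual Report", re.I)  # assumed
RE_PDF_HREF = re.compile(r"\.pdf$", re.I)
RE_BACKUP_PDF_HREF = re.compile(r"/pubs/", re.I)          # hrefs missing the .pdf extension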
Example #28
def extract_reports_for_subtopic(subtopic_url, year_range, topic, subtopic=None):
  if subtopic_url.startswith("http://httphttp://"):
    # See notes to IG's web team
    subtopic_url = subtopic_url.replace("http://http", "")

  body = utils.download(subtopic_url)
  doc = BeautifulSoup(body)
  results = doc.select("#body-row02-col02andcol03 a")

  if not results:
    results = doc.select("#body-row02-col01andcol02andcol03 a")
  if not results and "There are currently no reports in this category" not in doc.text:
    raise AssertionError("No report links found for %s" % subtopic_url)

  topic_name = TOPIC_NAMES[topic]
  # Broadcasting Board of Governors is a fully independent agency
  if topic == 'BBG' or subtopic == 'Broadcasting Board of Governors':
    agency = 'bbg'
  else:
    agency = 'state'

  for result in results:
    report = report_from(result, year_range, agency, topic_name, subtopic)
    if report:
      inspector.save_report(report)
Example #29
def semiannual_report_from(result, year_range):
  link = result.find("a")

  title = link.text

  # Parse the report title. Ex:
  # 'OIG Semiannual Report to the Congress: October 1, 2013 - March 31, 2014 (incl. MCC)'
  published_on_text = title.split("-")[-1].split("–")[-1].split("(")[0].strip()
  published_on_text = published_on_text.replace("September 31", "September 30")  # See note to IG Web team
  published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % title)
    return

  landing_url = urljoin(SEMIANNUAL_REPORTS_URL, link.get('href'))
  landing_page = BeautifulSoup(utils.download(landing_url))

  report_url = landing_page.select("div.filefield-file a")[0].get('href')
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  report = {
    'inspector': "usaid",
    'inspector_url': "https://oig.usaid.gov",
    'agency': "usaid",
    'agency_name': "Agency For International Development",
    'type': 'semiannual_report',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Example #30
def run(options):
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # default to starting at page 1
  begin = int(options.get('begin', 1))

  max_page = None
  for page in range(begin, (int(pages) + 1)):
    if max_page and (page > max_page):
      logging.debug("End of pages!")
      break

    logging.debug("## Downloading page %i" % page)
    url = url_for(options, page)
    body = utils.download(url)
    doc = BeautifulSoup(body)

    # When the USPS restores their page controls, we can use this again,
    # which saves one network call each time.
    max_page = last_page_for(doc)

    results = doc.select(".views-row")

    for result in results:
      report = report_from(result)

      # inefficient enforcement of --year arg, USPS doesn't support it server-side
      # TODO: change to published_on.year once it's a datetime
      if inspector.year_from(report) not in year_range:
        logging.warn("[%s] Skipping report, not in requested range." % report['report_id'])
        continue

      inspector.save_report(report)
Example #31
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  for report_type, report_url in REPORT_URLS.items():
    doc = BeautifulSoup(utils.download(report_url))
  results = doc.select("td.mainInner div.ms-WPBody li")

    if not results:
      raise inspector.NoReportsFoundError("SIGTARP (%s)" % report_url)

    for result in results:
      report = report_from(result, report_type, year_range)
      if report:
        inspector.save_report(report)
Example #32
def beautifulsoup_from_url(url):
    body = utils.download(url)
    if body is None: return None

    doc = BeautifulSoup(body)

    # Some of the pages will return meta refreshes
    if doc.find("meta") and doc.find("meta").attrs.get(
            'http-equiv') == 'REFRESH':
        redirect_url = urljoin(
            url,
            doc.find("meta").attrs['content'].split("url=")[1])
        return beautifulsoup_from_url(redirect_url)
    else:
        return doc
Example #33
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the reports
    doc = BeautifulSoup(utils.download(REPORTS_URL))
    results = doc.find("table", {"cellpadding": "5"}).select("tr")
    if not results:
        raise inspector.NoReportsFoundError("FDIC")
    for index, result in enumerate(results):
        if index < 3 or not result.text.strip():
            # The first three rows are headers
            continue
        report = report_from(result, year_range)
        if report:
            inspector.save_report(report)
Example #34
def parse_result_from_js_url(url, format_slug, year, year_range, report_type):
    """
  Given a link to a javascript file that has report data, add all of the reports
  """

    body = utils.download(url, scraper_slug="tigta")
    # Pulling out javascript array values that look like:
    # arrid[0]=new AR("200720002","Stronger Management Oversight Is Required to Ensure Valuable Systems Modernization Expertise Is Received From the Federally Funded Research and Development Center Contractor","20061020","01",2,0,0,0);
    # Look in https://www.treasury.gov/tigta/oa_auditreports_fy14.js for some more examples.
    results = re.findall(r'arrid\[\d+\]=new AR\((.*)\);', body)
    for result in results:
        report = report_from(result, format_slug, year, year_range,
                             report_type)
        if report:
            inspector.save_report(report)
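A hedged sketch of how one matched argument string might be split into fields before building the report; csv handles the quoted, comma-containing title, and the field meanings are inferred only from the sample line in the comment above:

import csv
import datetime

def parse_ar_arguments(result):
    # result looks like: "200720002","Stronger Management Oversight ...","20061020","01",2,0,0,0
    fields = next(csv.reader([result]))
    report_number, title, date_text = fields[0], fields[1], fields[2]
    published_on = datetime.datetime.strptime(date_text, "%Y%m%d")
    return report_number, title, published_on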
Example #35
def get_pagination_urls(page):
    """Find the pagination links on the page and yield them all.

  This method recursively downloads new pages in the case that there are more
  than 10.
  """
    for link in page.select('a'):
        if not link.has_attr('href'):
            continue
        if link['href'].startswith('?') and RE_DIGITS.match(link.text):
            yield BASE_URL + link['href']
        elif link['href'].startswith('/pubs') and RE_NEXT_10.search(link.text):
            new_url = urljoin(BASE_URL, link['href'])
            page = BeautifulSoup(utils.download(new_url))
            for link in get_pagination_urls(page):
                yield link
Example #36
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the reports
    for report_type, url in REPORT_TYPE_MAP.items():
        doc = BeautifulSoup(utils.download(url))
        results = doc.select("div#content div#contentMain ul li.pdf")
        if not results:
            raise inspector.NoReportsFoundError("CPB (%s)" % url)
        for result in results:
            if not result.find('a'):
                # Skip unlinked PDF's
                continue
            report = report_from(result, url, report_type, year_range)
            if report:
                inspector.save_report(report)
Example #37
def get_subtopic_map(topic_url):
    body = utils.download(topic_url)
    doc = BeautifulSoup(body)

    subtopic_map = {}
    for link in doc.select("#leftContentInterior li a"):
        absolute_url = urljoin(topic_url, link['href'])
        absolute_url = strip_url_fragment(absolute_url)

        # Only add new URLs
        if absolute_url not in subtopic_map.values():
            subtopic_map[link.text] = absolute_url

    if not subtopic_map:
        raise inspector.NoReportsFoundError("OEI (subtopics)")

    return subtopic_map
Example #38
def reports_from_page(url_format, page, report_type, year_range, year=''):
    url = url_format.format(page=page, year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("td.views-field")
    if not results:
        results = doc.select("div.views-row")
    if not results:
        return False

    for result in results:
        if not result.text.strip():
            # Skip empty rows
            continue
        report = report_from(result, report_type, year_range)
        if report:
            inspector.save_report(report)
    return True
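reports_from_page returns False once a page has no results, which implies a caller loop along these lines; the PAGINATED_REPORT_FORMATS mapping is borrowed from Example #7 and the starting page number is a guess:

for report_type, url_format in PAGINATED_REPORT_FORMATS.items():
    page = 1
    while reports_from_page(url_format, page, report_type, year_range):
        page += 1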
Example #39
def run(options):
    year_range = inspector.year_range(options, archive)

    doc = BeautifulSoup(utils.download(REPORTS_URL))

    results = None
    for section in doc.find_all("section"):
        if section.h4 and section.h4.text.strip() == "Publications":
            results = section.find_all("a")
            break

    if not results:
        raise inspector.NoReportsFoundError("Denali Commission")
    for result in results:
        report = report_from(result, year_range)
        if report:
            inspector.save_report(report)
Example #40
def semiannual_report_from(result, year_range):
    # This will look like "toggleReport('SARC-47-49');" and we want to pull out
    # the SARC-47-49
    report_id_javascript = result.get('onclick')
    report_id = re.search("'(.*)'", report_id_javascript).groups()[0]
    landing_url = "http://oig.pbgc.gov/sarc/{report_id}.html".format(
        report_id=report_id)
    landing_page = BeautifulSoup(utils.download(landing_url))

    title = " ".join(landing_page.select("h3")[0].text.split())
    relative_report_url = landing_page.find(
        "a", text="Read Full Report").get('href')

    # The relative report urls try to go up a level too many. Most browsers seem
    # to just ignore this so we will too.
    relative_report_url = relative_report_url.replace("../", "", 1)
    report_url = urljoin(SEMIANNUAL_REPORTS_URL, relative_report_url)

    # There is probably a way to be a bit smarter about this
    summary = landing_page.text.strip()

    published_on_text = title.rsplit("-")[-1].rsplit("through")[-1].replace(
        ".", "").strip()
    published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % title)
        return

    report = {
        'inspector': "pbgc",
        'inspector_url': "http://oig.pbgc.gov",
        'agency': "pbgc",
        'agency_name': "Pension Benefit Guaranty Corporation",
        'type': 'semiannual_report',
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if summary:
        report['summary'] = summary
    if landing_url:
        report['landing_url'] = landing_url
    return report
Example #41
def extract_from_release_page(landing_url):
    doc = BeautifulSoup(utils.download(landing_url))
    main = doc.select("#main #lefSide")[0]

    url_elem = main.select("div")[2].select("a")
    if url_elem:
        url = urljoin(landing_url, url_elem[0]['href'])
    else:
        url = None

    summary = ""
    for p in main.select("p"):
        summary += p.text + "\n\n"

    # will only be used if the title isn't present on the listing
    title = main.select("h2")[0].text.strip()

    return (url, summary.strip(), title)
Example #42
def run(options):
  year_range = inspector.year_range(options, archive)

  for index in INDEX_URLS:
    report_count = 0
    for year in year_range:
      url = url_for(options, index, year)
      body = utils.download(url)

      doc = BeautifulSoup(body)

      results = doc.select("div.view-content div.views-row")
      for result in results:
        report = report_from(result)
        inspector.save_report(report)
        report_count = report_count + 1

    if report_count == 0:
      raise inspector.NoReportsFoundError("Amtrak (%s)" % index.split("/")[-1])
Example #43
def run(options):
    year_range = inspector.year_range(options, archive)

    last_page = options.get("end")
    start = int(options.get("start", 1))

    # Pull the reports
    for (reports_page, report_type) in REPORTS_URLS:

        page = start
        last_page = options.get("end")  # reset for each area
        while True:
            url = url_for(reports_page, page)
            doc = BeautifulSoup(utils.download(url))

            if last_page is None:
                last_page = last_page_from(doc)

            if report_type == "case":
                results = doc.select("div#main div.grayBox2")
            else:
                results = doc.select("div#main div.whiteBox")

            if results:
                for result in results:
                    report = report_from(result, reports_page, report_type,
                                         year_range)
                    if report:
                        inspector.save_report(report)
            elif report_type != "case":
                raise inspector.NoReportsFoundError("CNCS (%s)" % url)
            # closed cases have broken pagination (p6, 7, 8 missing) so ignore
            else:
                pass

            if int(page) >= int(last_page):
                break
            else:
                page += 1

    # one hardcoded peer review, just always do it
    inspector.save_report(do_peer_review())
Example #44
def run(options):
    year_range = inspector.year_range(options, archive)
    only_id = options.get('report_id')

    logging.info("## Downloading reports from %i to %i" %
                 (year_range[0], year_range[-1]))

    url = url_for()
    body = utils.download(url)

    doc = BeautifulSoup(body)
    results = doc.select("section")
    if not results:
        raise inspector.NoReportsFoundError("OPM")
    for result in results:
        try:
            year = int(result.get("title"))
            # check that the fetched year is in the range
            if year not in year_range:
                continue
            logging.info("## Downloading year %i " % year)
        except ValueError:
            continue

        # gets each table entry and generates a report from it
        listings = result.div.table.tbody.contents
        for item in listings:
            if type(item) is not bs4.element.Tag:
                continue
            report = report_from(item)

            if report['report_id'] in BLACKLIST:
                logging.warn(
                    "Skipping downed report: remember to report this and get it fixed!"
                )
                continue

            # can limit it to just one report, for debugging convenience
            if only_id and only_id != report['report_id']:
                continue

            inspector.save_report(report)
Example #45
def run(options):
    year_range = inspector.year_range(options, archive)

    only = options.get('topics')
    if only:
        only = set(only.split(','))
    else:
        only = ALL_TOPIC_AREAS

    index_body = utils.download(BASE_URL)

    current_year = None
    index = BeautifulSoup(index_body)
    tables = index.select('table.style1')
    if not tables:
        raise inspector.NoReportsFoundError("EPA")
    for table in tables:
        trs = table.select('tr')
        for tr in trs:
            tds = tr.select('td')
            if len(tds) < 8:
                if len(tds) == 1:
                    # Large column that indicates year
                    col_links = tds[0].select('a')
                    if len(col_links) == 1:
                        col_text = col_links[0].text
                        if RE_YEAR.match(col_text):
                            current_year = col_text
                continue

            published_on_dt = datetime.datetime.strptime(
                tds[6].text, '%m/%d/%Y')
            if published_on_dt.year not in year_range:
                continue

            topic_areas = set(tds[7].text.split(', '))
            if not len(topic_areas.intersection(only)):
                continue

            report = report_from(tds, published_on_dt, current_year)
            if report:
                inspector.save_report(report)
Example #46
def run(options):
    year_range = inspector.year_range(options, archive)
    topics = options.get('topics')
    if topics:
        topics = topics.split(",")
    else:
        topics = TOPIC_TO_URL.keys()

    for topic in topics:
        topic_url = TOPIC_TO_URL[topic]
        body = utils.download(topic_url)
        doc = BeautifulSoup(body)

        try:
            year_results = doc.select("#Listing")[0]
            results = [x for x in year_results.select("ul li ul li")]
        except IndexError:
            try:
                all_results = doc.select("#bodyholder")[0]
                results = [x for x in all_results.select("ul li")]
            except IndexError:
                results = doc.select("table ul li")
        if not results:
            raise inspector.NoReportsFoundError("SEC (%s)" % topic)

        # Sometimes multiple reports are listed under the same datetime element.
        # We store which published datetime we saw last so that the next report
        # can use it if we are unable to find another published time.
        last_published_on = None
        for result in results:
            report, last_published_on = report_from(result, topic_url, topic,
                                                    year_range,
                                                    last_published_on)
            if report:
                inspector.save_report(report)

    for canned_report in CANNED_REPORTS:
        report_datetime = datetime.datetime.strptime(
            canned_report["published_on"], "%Y-%m-%d")
        if report_datetime.year in year_range:
            add_common_fields(canned_report)
            inspector.save_report(canned_report)
Example #47
def urls_for(options, only):
  year_range = inspector.year_range(options, archive)
  for office in only:
    # there's always a first year, and it defaults to current year
    params = {}
    params['searchdate1'] = '01/01/%s' % year_range[0]
    params['searchdate2'] = '12/31/%s' % year_range[-1] # could be the same year
    params['office'] = OFFICES[office]
    params['sort'] = 'report_number'
    params['order'] = 'desc'

    query_string = urlencode(params)
    url = '{0}?{1}'.format(BASE_URL, query_string)
    yield url

    body = utils.download(url)
    page = BeautifulSoup(body)

    for url in get_pagination_urls(page):
      yield url
Example #48
def run(options):
  only = options.get('topics')
  if only:
    only = set(only.split(','))
  else:
    # Default to all offices, whee!
    only = list(OFFICES.keys())

  for url in urls_for(options, only):
    body = utils.download(url)
    page = BeautifulSoup(body)

    report_table = page.select('table[summary~="reports"]')[0]
    for tr in report_table.select('tr')[1:]:
      tds = tr.select('td')
      if len(tds) == 1:
        # Page has no reports, simply a "No Data" indication for these dates.
        break
      report = report_from(tds, options)
      if report:
        inspector.save_report(report)
Example #49
def run(options):
    year_range = inspector.year_range(options, archive)

    doc = BeautifulSoup(utils.download(REPORTS_URL))

    # Pull the audit reports
    audit_header = doc.find("a", attrs={"name": 'Audit Reports'})
    audit_list1 = audit_header.find_next("ul").select("li")
    # They have two separate uls for these reports. See note to the IG web team.
    audit_list2 = audit_header.find_next("ul").find_next("ul").select("li")
    results = audit_list1 + audit_list2
    if not results:
        raise inspector.NoReportsFoundError("FEC (audit reports)")
    for result in results:
        report = report_from(result, year_range, report_type='audit')
        if report:
            inspector.save_report(report)

    # Pull the inspection reports
    inspections_header = doc.find("a", attrs={"name": 'Inspection Reports'})
    results = inspections_header.find_next("ul").select("li")
    if not results:
        raise inspector.NoReportsFoundError("FEC (inspection reports)")
    for result in results:
        report = report_from(result, year_range, report_type='inspection')
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    semiannual_header = doc.find("a", attrs={"name": 'Semiannual Reports'})
    results = semiannual_header.find_next("ul").select("li")
    if not results:
        raise inspector.NoReportsFoundError("FEC (semiannual reports)")
    for result in results:
        report = report_from(result,
                             year_range,
                             report_type='semiannual_report',
                             title_prefix="Semiannual Report - ")
        if report:
            inspector.save_report(report)
Example #50
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the reports
    for report_type, url in REPORT_URLS.items():
        doc = BeautifulSoup(utils.download(url))
        results = doc.select("div.section1 div.ltext > table tr")
        if not results:
            results = doc.select(
                "td.three-col-layout-middle div.ltext > table tr")
        if not results:
            raise inspector.NoReportsFoundError(
                "Government Publishing Office (%s)" % url)
        for result in results:
            if (not result.text.strip() or result.find("th")
                    or result.find("strong")
                    or result.contents[1].text in HEADER_TITLES):
                # Skip header rows
                continue
            report = report_from(result, url, report_type, year_range)
            if report:
                inspector.save_report(report)
Example #51
def run(options):
  year_range = inspector.year_range(options, archive)

  max_pages = options.get('pages', None)
  if max_pages:
    max_pages = int(max_pages)

  for year in year_range:
    page = 1
    done = False
    while not done:
      url = url_for(options, page, year)
      body = utils.download(url)

      doc = BeautifulSoup(body)

      next_page = page + 1
      found_next_page = False
      page_links = doc.select("li.pager-item a.active")
      for page_link in page_links:
        if page_link.text == str(next_page):
          found_next_page = True
          break
      if not found_next_page:
        done = True
      if max_pages and (next_page > max_pages):
        done = True

      results = doc.select("table.views-table > tbody > tr")
      if not results:
        raise inspector.NoReportsFoundError("Amtrak")
      for result in results:
        report = report_from(result)
        inspector.save_report(report)

      page = next_page
      if not done:
        logging.info('Moving to next page (%d)' % page)
Example #52
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL))

    headers = doc.select("p.Ptitle1")
    if not headers:
        raise inspector.NoReportsFoundError("ITC")

    for header in headers:
        year = int(header.text.strip())
        results = header.findNextSibling("ul").select("li")

        for result in results:
            if not inspector.sanitize(result.text):
                logging.debug("Skipping empty list item.")
                continue

            report = audit_report_from(year, result, AUDIT_REPORTS_URL,
                                       year_range)
            if report:
                inspector.save_report(report)
Example #53
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    doc = BeautifulSoup(utils.download(REPORTS_URL))
    results = doc.select("ul.text > ul > li")
    if not results:
        raise inspector.NoReportsFoundError("CFTC audit reports")
    for result in results:
        report = report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    results = doc.select("ul.text td a")
    if not results:
        raise inspector.NoReportsFoundError("CFTC semiannual reports")
    for result in results:
        report = report_from(result,
                             year_range,
                             report_type="semiannual_report")
        if report:
            inspector.save_report(report)
Ejemplo n.º 54
0
def run(options):
    year_range = inspector.year_range(options, archive)

    only_report_id = options.get('report_id')

    # Pull the reports
    for report_type, url in REPORT_URLS.items():
        doc = BeautifulSoup(utils.download(url))
        results = doc.select("div.field-item li")
        if not results:
            results = doc.select("div.field-item tr")
        if not results:
            raise inspector.NoReportsFoundError(
                "National Endowment for the Arts (%s)" % report_type)
        for result in results:
            report = report_from(result, url, report_type, year_range)

            if report:
                # debugging convenience: can limit to single report
                if only_report_id and (report['report_id'] != only_report_id):
                    continue

                inspector.save_report(report)
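A hypothetical driver call illustrating the report_id debugging convenience above; the id value is made up and other options are omitted:

run({'report_id': 'example-report-id'})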
Ejemplo n.º 55
0
def run(options):
    year_range = inspector.year_range(options, archive)
    report_flag = False

    # Pull the table of reports for each year
    for year in year_range:
        url = url_for_year(year)
        html = utils.download(url, scraper_slug="osc")

        if html is None:
            if year == max(year_range):
                continue
            else:
                raise Exception("Couldn't fetch reports page {}".format(url))

        #  spaces appear as &#160; and \u200b .... fix that now
        html = html.replace('&#160;', ' ').replace('\u200b', ' ').replace(
            '\u00a0', ' ').replace('\r', '').replace('\n', '')
        doc = BeautifulSoup(html, "lxml")

        OUTCOME_CODES = generate_outcome_codes(doc)

        # a few reports appear multiple times... ignore them the second time if they appear more than once
        keys_used = []

        # no ids on the tables, but it's the second one
        results = doc.findAll("table")[1].tbody.findAll('tr')
        for result in results:
            reports = report_from(result, year, year_range, url, OUTCOME_CODES)
            for report in reports:
                if report['report_id'] not in keys_used:
                    inspector.save_report(report)
                    keys_used.append(report['report_id'])
                    report_flag = True

    if not report_flag:
        raise inspector.NoReportsFoundError("OSC")
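A plausible shape for the url_for_year helper used above; the URL pattern is an assumption about osc.gov, not the scraper's actual constant:

def url_for_year(year):
    # hypothetical per-year index URL; the real pattern may differ
    return "https://osc.gov/reports/{year}".format(year=year)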
Ejemplo n.º 56
0
def run(options):
    year_range = inspector.year_range(options, archive)
    pages = options.get('pages', ALL_PAGES)

    # default to starting at page 1
    begin = int(options.get('begin', 1))

    max_page = None
    for page in range(begin, (int(pages) + 1)):
        if max_page and (page > max_page):
            logging.debug("End of pages!")
            break

        logging.debug("## Downloading page %i" % page)
        url = url_for(options, page)
        body = utils.download(url)
        doc = BeautifulSoup(body)

        # When the USPS restores their page controls, we can use this again,
        # which saves one network call each time.
        max_page = last_page_for(doc)

        results = doc.select(".views-row")
        if not results:
            raise inspector.NoReportsFoundError("USPS")
        for result in results:
            report = report_from(result)

            # inefficient enforcement of --year arg, USPS doesn't support it server-side
            # TODO: change to published_on.year once it's a datetime
            if inspector.year_from(report) not in year_range:
                logging.warn("[%s] Skipping report, not in requested range." %
                             report['report_id'])
                continue

            inspector.save_report(report)
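A plausible implementation of the last_page_for helper referenced above, assuming a standard Drupal-style pager; the selector and parameter name are guesses. It returns None when page controls are missing, which the caller already tolerates:

from urllib.parse import parse_qs, urlparse


def last_page_for(doc):
    # hypothetical selector for the pager's "last page" link
    links = doc.select("li.pager-last a")
    if not links:
        return None  # no page controls on the index page
    query = parse_qs(urlparse(links[0]['href']).query)
    # Drupal pagers are typically 0-indexed, so add 1 to get the page count
    return int(query["page"][0]) + 1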
Ejemplo n.º 57
0
def report_from(result, report_type, year_range):
    landing_page_link = result.find("a")
    title = landing_page_link.text.strip()
    landing_url = urljoin(BASE_REPORT_URL, landing_page_link.get('href'))

    # Sometimes the last report on one page is also the first report on the next
    # page. Here, we skip any duplicate landing pages we've already saved.
    if landing_url in visited_landing_urls:
        return

    # This landing page is a duplicate of another one
    if landing_url == "http://oig.ssa.gov/physical-security-office-disability-" \
          "adjudication-and-reviews-headquarters-building-limited-0":
        return

    published_on_text = result.select(
        "span.date-display-single")[0].text.strip()
    published_on = datetime.datetime.strptime(published_on_text,
                                              '%A, %B %d, %Y')

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % title)
        return

    try:
        report_id = result.select("span.field-data")[0].text.strip()
    except IndexError:
        report_id = landing_url.split("/")[-1]

    # This report has the wrong report number entered
    if landing_url == "http://oig.ssa.gov/audits-and-investigations/" \
          "audit-reports/congressional-response-report-internet-claim-" \
          "applications-0":
        report_id = "A-07-10-20166"

    landing_page = BeautifulSoup(utils.download(landing_url))

    unreleased = False
    if "Limited Distribution" in title:
        unreleased = True
        report_url = None
    else:
        try:
            report_url = result.select("span.file a")[0].get('href')
        except IndexError:
            if not unreleased:
                try:
                    report_url = landing_page.find(
                        "a", attrs={"type": 'application/octet-stream;'}
                    ).get('href')
                except AttributeError:
                    unreleased = True
                    report_url = None

    try:
        summary = landing_page.select(
            "div.field-type-text-with-summary")[0].text.strip()
    except IndexError:
        summary = None

    file_type = None
    if report_url:
        _, extension = os.path.splitext(report_url)
        if not extension:
            file_type = 'html'

    visited_landing_urls.add(landing_url)

    report = {
        'inspector': "ssa",
        'inspector_url': "http://oig.ssa.gov",
        'agency': "ssa",
        'agency_name': "Social Security Administration",
        'type': report_type,
        'landing_url': landing_url,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if unreleased:
        report['unreleased'] = unreleased
    if file_type:
        report['file_type'] = file_type
    if summary:
        report['summary'] = summary
    return report
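report_from above relies on module-level state for its duplicate-landing-page check; a minimal sketch of how that set would be declared alongside the scraper's other constants:

visited_landing_urls = set()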
Ejemplo n.º 58
0
def beautifulsoup_from_url(url):
    body = utils.download(url)
    return BeautifulSoup(body)
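A possible call site for this helper, assuming a REPORTS_URL constant defined elsewhere in the same scraper:

doc = beautifulsoup_from_url(REPORTS_URL)
results = doc.select("li")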
Ejemplo n.º 59
0
def run(options):
    year_range = inspector.year_range(options, archive)

    published_on = None
    for page_url in [
            WHATS_NEW_URL, WHATS_NEW_ARCHIVE_URL,
            SEMIANNUAL_REPORTS_AND_TESTIMONIES_URL
    ]:
        body = utils.download(page_url)
        doc = BeautifulSoup(body)

        maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
        all_a = maincontent.find_all("a")
        if not all_a:
            raise inspector.NoReportsFoundError("Ex-Im Bank (%s)" % page_url)
        for a in all_a:
            a_text = str(a.text)
            if a_text.strip() == "":
                continue

            a_href = a.get("href")
            if a_href.startswith("mailto:"):
                continue
            if a_href.startswith("https://public.govdelivery.com/"):
                continue
            if page_url == WHATS_NEW_URL and a_href == "/oig/whats-new-archive.cfm":
                # end of page
                break

            if deduplicate_url(a_href):
                continue

            # Now, we want to grab all of the text associated with this link.
            # If there is just one link inside of a paragraph tag, we can take the
            # text contents of that paragraph tag. Otherwise, we use "previous" to
            # grab all the text that comes before the link.

            parent_p = a
            while parent_p.name != "p":
                parent_p = parent_p.parent
            links_in_parent = parent_p.find_all("a")
            links_in_parent = [link for link in links_in_parent
                               if link.text.strip()]
            links_in_parent = {link.get("href") for link in links_in_parent}
            if len(links_in_parent) == 1:
                all_text = parent_p.text
            else:
                all_text = a_text
                node = a.previous
                while True:
                    if is_inside_link(node):
                        break
                    if isinstance(node, NavigableString):
                        all_text = node + all_text
                    node = node.previous
                    if not node:
                        break
                    if node == maincontent:
                        break

            # Response letters don't get their own date heading -- keep date from
            # last report and reuse in those cases
            temp = DATE_RE.search(all_text)
            if temp:
                # For semiannual reports to congress, use the second date from the text
                # Also, tack the date onto the report_id to disambiguate
                if (page_url == SEMIANNUAL_REPORTS_AND_TESTIMONIES_URL and
                        a_text.strip().startswith('Semiannual Report to Congress')):
                    a_text = a_text.strip() + ' ' + temp.group(0) + ' - '
                    temp = DATE_RE.search(all_text, temp.end() + 1)
                    a_text = a_text + temp.group(0)
                date_text = temp.group(0).replace('Sept ', 'Sep ')
                try:
                    published_on = datetime.strptime(date_text, '%B %d, %Y')
                except ValueError:
                    published_on = datetime.strptime(date_text, '%b %d, %Y')
            if (published_on is None) or (published_on.year not in year_range):
                continue

            report = report_from(all_text, a_text, a_href, page_url,
                                 published_on)
            inspector.save_report(report)

    for page_url in [PRESS_RELEASES_URL, PRESS_RELEASES_ARCHIVE_URL]:
        done = False
        body = utils.download(page_url)
        doc = BeautifulSoup(body)

        maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
        all_p = maincontent.find_all("p")

        for p in all_p:
            for all_text, link_text, link_url in recurse_tree(p, False):
                if link_url is None:
                    continue
                if link_url.startswith("mailto:"):
                    continue
                if page_url == WHATS_NEW_URL and link_url == "/oig/whats-new-archive.cfm":
                    # end of page
                    done = True
                    break
                if link_url.startswith("https://public.govdelivery.com/"):
                    continue
                for index_url in URLS:
                    if index_url.find(link_url) != -1:
                        continue

                if deduplicate_url(link_url):
                    continue

                date_match = DATE_RE.search(all_text)
                try:
                    date_text = date_match.group(0).replace('Sept ', 'Sep ')
                    published_on = datetime.strptime(date_text, '%B %d, %Y')
                except ValueError:
                    published_on = datetime.strptime(date_text, '%b %d, %Y')
                if published_on.year not in year_range:
                    continue

                report = report_from(all_text, link_text, link_url, page_url,
                                     published_on)
                inspector.save_report(report)
            if done:
                break
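A regex consistent with the date handling above (full or abbreviated month name, day, four-digit year, with "Sept" tolerated before normalization); the scraper's real DATE_RE may differ:

import re

DATE_RE = re.compile(
    r"(January|February|March|April|May|June|July|August|September|"
    r"October|November|December|"
    r"Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sept|Sep|Oct|Nov|Dec)"
    r" \d{1,2}, \d{4}")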
Ejemplo n.º 60
0
def fetch_from_landing_page(landing_url):
  """Returns a tuple of (pdf_link, summary_text)."""
  add_pdf = False
  skip = False

  body = utils.download(landing_url)
  page = BeautifulSoup(body)

  report_tables = page.select('table[summary~="reports"]')
  # in the rare case that doesn't work, have faith
  if len(report_tables) == 0:
    report_tables = page.select('table')
  table = report_tables[0]
  examine_text = table.text

  maybe_unreleased = False
  if any(regex.search(examine_text) for regex in (
      RE_OFFICIAL, RE_CLASSIFIED, RE_FOIA,
      RE_AFGHANISTAN, RE_RESTRICTED, RE_INTEL)):
    # 'Official use only' or 'Classified' materials don't have PDFs. Mark the
    # report metadata appropriately.
    maybe_unreleased = True

  # two varieties of normal report link
  link = page.find('a', text=RE_PDF_LINK_TEXT, href=RE_PDF_HREF)
  if not link:
    link = page.find('a', text=RE_PDF_CLICK_TEXT, href=RE_PDF_HREF)

  # Semi annual reports to Congress
  if not link:
    link = page.find('a', text=RE_PDF_SARC_TEXT, href=RE_PDF_HREF)

  # occurs for some multi-part reports, top/body/bottom
  if not link:
    link = page.find('a', text=RE_PDF_BODY_MAYBE, href=RE_PDF_HREF)

  # cases where .pdf is left off, ugh, e.g.
  # http://www.dodig.mil/pubs/report_summary.cfm?id=849
  if not link:
    link = page.find('a', text=RE_PDF_LINK_TEXT, href=RE_BACKUP_PDF_HREF)
    if link:
      add_pdf = True

  # last resort, slow python-based check for tightest requirements
  if not link:
    link = page.find(pdf_test)

  # before accepting *any* PDF, check for skippable offenses
  if not link and any(regex.search(table.text) for regex in (
      RE_EXTERNALLY_HOSTED, RE_RESCINDED, RE_RETRACTED, RE_UNUSED)):
    skip = True

  # okay, I'll take *any* PDF
  if not link:
    link = table.find(any_pdf_test)

  href = link['href'].strip() if link else None
  if href and add_pdf:
    href = href + ".pdf"

  # some URLs have "/../" in the middle, and the redirects are trouble
  if href:
    href = href.replace("/../", "/")

  summary = None
  text_tr = page.select('tr[valign="top"] td')
  if text_tr:
    text = [node.strip() for node in text_tr[0].findAll(text=True)]
    summary = '\n\n'.join(text)
  if not summary:
    logging.info('\tno summary text found')

  return (href, summary, maybe_unreleased, skip)
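Sketches of the tag-predicate fallbacks used above; BeautifulSoup's find() accepts a callable that is handed each tag in turn. The exact matching criteria here are assumptions, not the scraper's real helpers:

def any_pdf_test(tag):
  # loosest check: any link whose href ends in .pdf
  return tag.name == "a" and tag.get("href", "").lower().endswith(".pdf")


def pdf_test(tag):
  # tighter check: the link must also describe itself as a PDF in its text
  return any_pdf_test(tag) and "pdf" in tag.get_text().lower()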