Ejemplo n.º 1
0
def run(options):
    """Scrape the EEOC OIG reports page and save reports in the year range."""
    year_range = inspector.year_range(options, archive)

    # The reports page lays out semiannual reports and all other reports in
    # the two cells of the second table row.
    page = BeautifulSoup(utils.download(REPORTS_URL))
    semiannual_cell, other_cell = page.select("table tr")[1].select("td")

    if not semiannual_cell:
        raise inspector.NoReportsFoundException("EEOC (semiannual reports)")
    if not other_cell:
        raise inspector.NoReportsFoundException("EEOC (other reports)")

    # Normalize the list markup in each cell before walking the items.
    merge_items(semiannual_cell)
    merge_items(other_cell)

    for item in semiannual_cell.select("li"):
        report = semiannual_report_from(
            item, year_range, title_prefix="Semiannual Report - ")
        if report:
            inspector.save_report(report)

    for item in other_cell.select("li"):
        report = report_from(item, year_range)
        if report:
            inspector.save_report(report)
Ejemplo n.º 2
0
    def run(self, options):
        """Crawl each listing URL and save every report found in the year range."""
        self.options = options
        self.year_range = inspector.year_range(self.options, archive)
        self.first_date = datetime.datetime(self.year_range[0], 1, 1)
        self.last_date = datetime.datetime(self.year_range[-1], 12, 31)

        for url in self.urls_for():
            page = BeautifulSoup(utils.download(url))

            # The site has used several layouts over time; try the most
            # specific selector first, then fall back to broader ones.
            nodes = (page.select('.energy-listing__results .node')
                     or page.select('.field-items .node')
                     or page.select('.node'))
            if not nodes:
                raise inspector.NoReportsFoundException(
                    "Department of Energy (%s)" % url)

            for node in nodes:
                # report_from returns a falsy value for reports outside the
                # date range (or without an ID); those are simply skipped.
                report = self.report_from(node)
                if report:
                    inspector.save_report(report)
Ejemplo n.º 3
0
def run(options):
    """Scrape the EPA OIG report index, filtered by year range and topic."""
    year_range = inspector.year_range(options, archive)

    # Optional comma-separated topic filter; default to every topic area.
    topics = options.get('topics')
    wanted_topics = set(topics.split(',')) if topics else ALL_TOPIC_AREAS

    index = BeautifulSoup(utils.download(BASE_URL))
    tables = index.select('table.style1')
    if not tables:
        raise inspector.NoReportsFoundException("EPA")

    current_year = None
    for table in tables:
        for row in table.select('tr'):
            cells = row.select('td')
            if len(cells) < 8:
                # A single wide cell holds a year heading that applies to
                # the report rows following it.
                if len(cells) == 1:
                    year_links = cells[0].select('a')
                    if len(year_links) == 1:
                        heading = year_links[0].text
                        if RE_YEAR.match(heading):
                            current_year = heading
                continue

            published_on_dt = datetime.datetime.strptime(
                cells[6].text, '%m/%d/%Y')
            if published_on_dt.year not in year_range:
                continue

            # Skip rows whose topic areas don't overlap the requested set.
            topic_areas = set(cells[7].text.split(', '))
            if topic_areas.isdisjoint(wanted_topics):
                continue

            report = report_from(cells, published_on_dt, current_year)
            if report:
                inspector.save_report(report)
Ejemplo n.º 4
0
def run(options):
    year_range = inspector.year_range(options, archive)

    published_on = None
    for page_url in [
            WHATS_NEW_URL, WHATS_NEW_ARCHIVE_URL,
            SEMIANNUAL_REPORTS_AND_TESTIMONIES_URL
    ]:
        body = utils.download(page_url)
        doc = BeautifulSoup(body)

        maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
        all_a = maincontent.find_all("a")
        if not all_a:
            raise inspector.NoReportsFoundException("Ex-Im Bank (%s)" %
                                                    page_url)
        for a in all_a:
            a_text = str(a.text)
            if a_text.strip() == "":
                continue

            a_href = a.get("href")
            if a_href.startswith("mailto:"):
                continue
            if a_href.startswith("https://public.govdelivery.com/"):
                continue
            if page_url == WHATS_NEW_URL and a_href == "/oig/whats-new-archive.cfm":
                # end of page
                break

            if deduplicate_url(a_href):
                continue

            # Now, we want to grab all of the text associated with this link.
            # If there is just one link inside of a paragraph tag, we can take the
            # text contents of that paragraph tag. Otherwise, we use "previous" to
            # grab all the text that comes before the link.

            parent_p = a
            while parent_p.name != "p":
                parent_p = parent_p.parent
            links_in_parent = parent_p.find_all("a")
            links_in_parent = [link for link in links_in_parent \
                                      if len(link.text.strip())]
            links_in_parent = set(
                [link.get("href") for link in links_in_parent])
            if len(links_in_parent) == 1:
                all_text = parent_p.text
            else:
                all_text = a_text
                node = a.previous
                while True:
                    if is_inside_link(node):
                        break
                    if isinstance(node, NavigableString):
                        all_text = node + all_text
                    node = node.previous
                    if not node:
                        break
                    if node == maincontent:
                        break

            # Response letters don't get their own date heading -- keep date from
            # last report and reuse in those cases
            temp = DATE_RE.search(all_text)
            if temp:
                # For semiannual reports to congress, use the second date from the text
                # Also, tack the date onto the report_id to disambiguate
                if page_url == SEMIANNUAL_REPORTS_AND_TESTIMONIES_URL and a_text.strip(
                ).startswith('Semiannual Report to Congress'):
                    a_text = a_text.strip() + ' ' + temp.group(0) + ' - '
                    temp = DATE_RE.search(all_text, temp.end() + 1)
                    a_text = a_text + temp.group(0)
                date_text = temp.group(0).replace('Sept ', 'Sep ')
                try:
                    published_on = datetime.strptime(date_text, '%B %d, %Y')
                except ValueError:
                    published_on = datetime.strptime(date_text, '%b %d, %Y')
            if (published_on is None) or (published_on.year not in year_range):
                continue

            report = report_from(all_text, a_text, a_href, page_url,
                                 published_on)
            inspector.save_report(report)

    for page_url in [PRESS_RELEASES_URL, PRESS_RELEASES_ARCHIVE_URL]:
        done = False
        body = utils.download(page_url)
        doc = BeautifulSoup(body)

        maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
        all_p = maincontent.find_all("p")

        for p in all_p:
            for all_text, link_text, link_url in recurse_tree(p, False):
                if link_url == None:
                    continue
                if link_url.startswith("mailto:"):
                    continue
                if page_url == WHATS_NEW_URL and link_url == "/oig/whats-new-archive.cfm":
                    # end of page
                    done = True
                    break
                if link_url.startswith("https://public.govdelivery.com/"):
                    continue
                for index_url in URLS:
                    if index_url.find(link_url) != -1:
                        continue

                if deduplicate_url(link_url):
                    continue

                date_match = DATE_RE.search(all_text)
                try:
                    date_text = date_match.group(0).replace('Sept ', 'Sep ')
                    published_on = datetime.strptime(date_text, '%B %d, %Y')
                except ValueError:
                    published_on = datetime.strptime(date_text, '%b %d, %Y')
                if published_on.year not in year_range:
                    continue

                report = report_from(all_text, link_text, link_url, page_url,
                                     published_on)
                inspector.save_report(report)
            if done: break
Ejemplo n.º 5
0
def run(options):
  year_range = inspector.year_range(options, archive)

  # optional: limit to a single report
  report_id = options.get("report_id")

  # Get the audit reports
  pre_1998_audit_flag = False
  for year in year_range:
    if year <= 1998:
      if pre_1998_audit_flag:
        continue
      else:
        pre_1998_audit_flag = True
    url = audit_url_for(year)
    doc = beautifulsoup_from_url(url)
    agency_tables = doc.find_all("table", {"border": 1})
    if not agency_tables:
      raise inspector.NoReportsFoundException("Department of Education (%d audit reports)" % year)
    for agency_table in agency_tables:
      results = agency_table.select("tr")
      for index, result in enumerate(results):
        if not index:
          # First row is the header
          continue
        report = audit_report_from(result, url, year_range)
        if report:
          # optional: filter to a single report
          if report_id and (report_id != report['report_id']):
            continue

          inspector.save_report(report)

  # Get semiannual reports
  doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  table = doc.find("table", {"border": 1})
  for index, result in enumerate(table.select("tr")):
    if index < 2:
      # The first two rows are headers
      continue
    report = semiannual_report_from(result, SEMIANNUAL_REPORTS_URL, year_range)
    if report:
      # optional: filter to a single report
      if report_id and (report_id != report['report_id']):
        continue
      inspector.save_report(report)

  # Get other reports
  for report_type, url in OTHER_REPORTS_URL.items():
    doc = beautifulsoup_from_url(url)
    results = doc.select("div.contentText ul li")
    if not results:
      raise inspector.NoReportsFoundException("Department of Education (%s)" % report_type)
    for result in results:
      report = report_from(result, url, report_type, year_range)
      if report:
        # optional: filter to a single report
        if report_id and (report_id != report['report_id']):
          continue

        inspector.save_report(report)