Example #1
def other_report_from(result, year_range):
    link = result.find("a")
    report_id = inspector.sanitize(
        clean_text("-".join(
            link.text.replace("/", "-").replace("'", "").replace(":",
                                                                 "").split())))
    report_id = re.sub('--*', '-', report_id)
    report_url = urljoin(OTHER_REPORTS_URL, link.get('href'))

    match = OTHER_REPORT_RE.match(inspector.sanitize(clean_text(link.text)))
    title = match.group(1)
    published_on_text = match.group(2)
    published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': "ncua",
        'inspector_url':
        "http://www.ncua.gov/about/Leadership/Pages/page_oig.aspx",
        'agency': "ncua",
        'agency_name': "National Credit Union Administration",
        'type': "other",
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    return report
Example #2
def other_report_from(result, year_range):
  link = result.find("a")
  report_id = inspector.sanitize(clean_text("-".join(link.text.replace("/", "-").replace("'", "").replace(":", "").split())))
  report_id = re.sub('--*', '-', report_id)
  report_url = urljoin(OTHER_REPORTS_URL, link.get('href'))

  match = OTHER_REPORT_RE.match(inspector.sanitize(clean_text(link.text)))
  title = match.group(1)
  published_on_text = match.group(2)
  published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': "ncua",
    'inspector_url': "http://www.ncua.gov/about/Leadership/Pages/page_oig.aspx",
    'agency': "ncua",
    'agency_name': "National Credit Union Administration",
    'type': "other",
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
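
A note on the report_id slug built in Examples #1 and #2: it replaces slashes with hyphens, drops apostrophes and colons, joins the remaining words on hyphens, and collapses hyphen runs. A minimal, self-contained sketch of that chain, with inspector.sanitize stubbed out (assuming it only strips stray control characters) and a made-up title:

import re

def slugify(link_text):
    # Same normalization as above, minus the sanitize step.
    slug = "-".join(link_text.replace("/", "-")
                             .replace("'", "")
                             .replace(":", "")
                             .split())
    return re.sub("--*", "-", slug)

print(slugify("Review of NCUA's 2014/2015 Budget: Phase 1"))
# -> Review-of-NCUAs-2014-2015-Budget-Phase-1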
Example #3
def semiannual_report_from(result, year_range):
  link = result.find("a")
  report_url = link.get('href')
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  title = "Semiannual report - {}".format(link.text.strip())

  link_text = inspector.sanitize(link.text)
  published_on_text = link_text.split("-")[-1].strip().replace(".pdf", "")
  published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'nlrb',
    'inspector_url': "https://www.nlrb.gov/who-we-are/inspector-general",
    'agency': 'nlrb',
    'agency_name': "National Labor Relations Board",
    'type': 'semiannual_report',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Example #4
    def fetch_from_landing_page(self, landing_url):
        """Returns a tuple of (pdf_link, summary_text, is_unreleased)."""
        unreleased = False
        page = BeautifulSoup(utils.download(landing_url))

        summary = None
        field_items = page.select('.field-items')
        if field_items:
            text = [node.strip() for node in field_items[0].findAll(text=True)]
            summary = '\n\n'.join(text).strip()
        if not summary:
            logging.info('\tno summary text found')
        else:
            # sanitize now instead of later, to compare to regexes
            summary = inspector.sanitize(summary)

        if (summary and (RE_NOT_AVAILABLE.search(summary)
                         or RE_NOT_AVAILABLE_2.search(summary)
                         or RE_NOT_AVAILABLE_3.search(summary)
                         or RE_NOT_AVAILABLE_4.search(summary)
                         or RE_WITHDRAWN.search(summary)
                         or RE_CLASSIFIED.search(summary))):
            unreleased = True

        report_url = None
        pdf_link = page.select('.file a')
        if not pdf_link:
            logging.warning('No pdf link found on page: {0}'.format(landing_url))
        else:
            report_url = pdf_link[0]['href']

        return report_url, summary, unreleased
Example #5
  def fetch_from_landing_page(self, landing_url):
    """Returns a tuple of (pdf_link, summary_text, is_unreleased)."""
    unreleased = False
    page = utils.beautifulsoup_from_url(landing_url)

    summary = None
    field_items = page.select('.field-items')
    if field_items:
      text = [node.strip() for node in field_items[0].findAll(text=True)]
      summary = '\n\n'.join(text).strip()
    if not summary:
      logging.info('\tno summary text found')
    else:
      # sanitize now instead of later, to compare to regexes
      summary = inspector.sanitize(summary)

    if (summary and (RE_NOT_AVAILABLE.search(summary)
                     or RE_NOT_AVAILABLE_2.search(summary)
                     or RE_NOT_AVAILABLE_3.search(summary)
                     or RE_NOT_AVAILABLE_4.search(summary)
                     or RE_WITHDRAWN.search(summary)
                     or RE_CLASSIFIED.search(summary))):
      unreleased = True

    report_url = None
    pdf_link = page.select('.field-name-field-download-files a')
    if not pdf_link:
      logging.warning('No pdf link found on page: {0}'.format(landing_url))
    else:
      report_url = pdf_link[0]['href']

    return report_url, summary, unreleased
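
In Examples #4 and #5, the unreleased check chains six regex searches with `or`. The same test scales better as a pattern list plus any(); a sketch with hypothetical stand-in patterns (the scraper's real RE_* regexes are defined elsewhere at module level):

import re

# Hypothetical stand-ins for the module-level patterns used above.
RE_NOT_AVAILABLE = re.compile(r"not\s+available", re.I)
RE_WITHDRAWN = re.compile(r"withdrawn", re.I)
RE_CLASSIFIED = re.compile(r"classified", re.I)

UNRELEASED_PATTERNS = [RE_NOT_AVAILABLE, RE_WITHDRAWN, RE_CLASSIFIED]

def is_unreleased(summary):
    # Equivalent to the chained `or`, but adding a pattern is a one-line change.
    return bool(summary) and any(p.search(summary) for p in UNRELEASED_PATTERNS)

print(is_unreleased("This report is not available to the public."))  # True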
Example #6
def semiannual_report_from(result, year_range):
    link = result.find("a")
    report_url = link.get('href')
    report_filename = report_url.split("/")[-1]
    report_id, _ = os.path.splitext(report_filename)

    title = "Semiannual report - {}".format(link.text.strip())

    link_text = inspector.sanitize(link.text)
    published_on_text = link_text.split("-")[-1].strip().replace(".pdf", "")
    published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'nlrb',
        'inspector_url': "https://www.nlrb.gov/who-we-are/inspector-general",
        'agency': 'nlrb',
        'agency_name': "National Labor Relations Board",
        'type': 'semiannual_report',
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    return report
Example #7
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)

  headers = set([a.parent for a in
                 doc.find_all("a", id=re.compile("^[0-9]{4}$"))])
  headers.update(doc.find_all("p", class_="Ptitle1"))
  headers = sorted(headers, key=lambda p: int(p.text.strip()), reverse=True)
  if not headers:
    raise inspector.NoReportsFoundError("ITC")

  for header in headers:
    year = int(header.text.strip())
    results = header.findNextSibling("ul").select("li")

    for result in results:
      if not inspector.sanitize(result.text):
        logging.debug("Skipping empty list item.")
        continue

      report = audit_report_from(year, result, AUDIT_REPORTS_URL, year_range)
      if report:
        inspector.save_report(report)
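
Example #7 pairs each year header with the <ul> of reports that follows it. That pairing is easy to exercise in isolation; a minimal check against made-up HTML (findNextSibling is BeautifulSoup's older alias for find_next_sibling):

from bs4 import BeautifulSoup

html = """
<p class="Ptitle1">2016</p>
<ul><li><a href="audit-16-01.pdf">Audit 16-01</a></li></ul>
"""
doc = BeautifulSoup(html, "html.parser")
header = doc.find("p", class_="Ptitle1")
year = int(header.text.strip())
results = header.find_next_sibling("ul").select("li")
print(year, len(results))  # 2016 1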
Example #8
def parse_year_accordion(content, landing_url, report_type, year_range):
    accordions = content.select("div.accordion-group")
    if not accordions:
        raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" %
                                            landing_url)
    for accordion in accordions:
        heading = accordion.select("div.accordion-heading")[0]
        year_text = inspector.sanitize(heading.text)
        body = accordion.select("div.accordion-body div.accordion-inner")[0]
        if year_text == "FY1995" and body.text.strip() == "FY1995":
            continue
        results = [a for a in body.find_all("a") if a.text.strip()]
        if not results:
            raise inspector.NoReportsFoundError(
                "Legal Services Corporation (%s)" % landing_url)
        for result in results:
            report = report_from(result, landing_url, report_type, year_range)
            if report:
                inspector.save_report(report)
Example #9
def parse_year_accordion(content, landing_url, report_type, year_range):
  accordions = content.select("div.accordion-group")
  if not accordions:
    raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" %
                                        landing_url)
  for accordion in accordions:
    heading = accordion.select("div.accordion-heading")[0]
    year_text = inspector.sanitize(heading.text)
    body = accordion.select("div.accordion-body div.accordion-inner")[0]
    if year_text == "FY1995" and body.text.strip() == "FY1995":
      continue
    results = [a for a in body.find_all("a") if a.text.strip()]
    if not results:
      raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" %
                                          landing_url)
    for result in results:
      report = report_from(result, landing_url, report_type, year_range)
      if report:
        inspector.save_report(report)
Example #10
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL))

  headers = doc.select("p.Ptitle1")
  if not headers:
    raise inspector.NoReportsFoundError("ITC")

  for header in headers:
    year = int(header.text.strip())
    results = header.findNextSibling("ul").select("li")

    for result in results:
      if not inspector.sanitize(result.text):
        logging.debug("Skipping empty list item.")
        continue

      report = audit_report_from(year, result, AUDIT_REPORTS_URL, year_range)
      if report:
        inspector.save_report(report)
Example #11
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL))

    headers = doc.select("p.Ptitle1")
    if not headers:
        raise inspector.NoReportsFoundError("ITC")

    for header in headers:
        year = int(header.text.strip())
        results = header.findNextSibling("ul").select("li")

        for result in results:
            if not inspector.sanitize(result.text):
                logging.debug("Skipping empty list item.")
                continue

            report = audit_report_from(year, result, AUDIT_REPORTS_URL,
                                       year_range)
            if report:
                inspector.save_report(report)
Example #12
def report_from(result, year_range, topic, subtopic_url, subtopic=None):
  # Ignore links to other subsections
  if result.get('class') and result['class'][0] == 'crossref':
    return

  if result.name == 'a':
    # Sometimes we already have a link
    result_link = result
  else:
    result_link = result.find("a")

  # No link found, this is probably just an extra <li> on the page.
  if result_link is None:
    return

  # If this is just an anchor link on the same page, skip
  if not strip_url_fragment(result_link['href']):
    return

  title = result_link.text
  title = title.replace("\xe2\x80\x93", "-")
  title = inspector.sanitize(title)
  title = re.sub(r'\s+', ' ', title)
  if title in TITLE_NORMALIZATION:
    title = TITLE_NORMALIZATION[title]

  if title in BLACKLIST_TITLES:
    return

  report_url = urljoin(subtopic_url, result_link['href']).strip()

  if report_url in REPORT_URL_MAPPING:
    report_url = REPORT_URL_MAPPING[report_url]

  # Fix copy-paste error in link
  if (title == "Medicare Compliance Review of Altru Hospital for "
          "2012 and 2013" and
          report_url == "http://oig.hhs.gov/oas/reports/region4/41408036.asp"):
    report_url = "http://oig.hhs.gov/oas/reports/region7/71505070.asp"

  # Ignore reports from other sites
  if BASE_URL not in report_url:
    return

  if report_url in BLACKLIST_REPORT_URLS:
    return

  if report_url in OEI_COMBINED_LANDING_PAGES:
    report_url = OEI_COMBINED_LANDING_PAGES[report_url][title]

  report_filename = report_url.split("/")[-1]
  report_id, extension = os.path.splitext(report_filename)

  if report_filename == "11302505.pdf":
    report_id = report_id + "_early_alert"

  # Try a quick check from the listing page to see if we can bail out based on
  # the year
  try:
    published_on_text = result.find_previous("dt").text.strip()
    published_on = datetime.datetime.strptime(published_on_text, "%m-%d-%Y")
  except (AttributeError, ValueError):
    published_on = None

  if published_on and published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  # This report is listed twice, once with the wrong date
  if published_on and published_on.year == 2012 and published_on.month == 1 and \
          published_on.day == 12 and report_id == "20901002":
    return

  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]
  else:
    # Process reports with landing pages
    if extension.lower() != '.pdf':
      report_url, published_on = report_from_landing_url(report_url)
    else:
      published_on = published_on_from_inline_link(
        result,
        report_filename,
        title,
        report_id,
        report_url,
      )

  if not published_on:
    admin.log_no_date("hhs", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  result = {
    'inspector': 'hhs',
    'inspector_url': 'http://oig.hhs.gov',
    'agency': 'hhs',
    'agency_name': 'Health & Human Services',
    'report_id': report_id,
    'topic': topic.strip(),
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if subtopic:
    result['subtopic'] = subtopic
  return result
Example #13
def report_from(result, year_range, topic, subtopic_url, subtopic=None):
    # Ignore links to other subsections
    if result.get("class") and result["class"][0] == "crossref":
        return

    if result.name == "a":
        # Sometimes we already have a link
        result_link = result
    else:
        result_link = result.find("a")

    # No link found, this is probably just an extra <li> on the page.
    if result_link is None:
        return

    # If this is just an anchor link on the same page, skip
    if not strip_url_fragment(result_link["href"]):
        return

    title = result_link.text
    title = title.replace("\xe2\x80\x93", "-")
    title = inspector.sanitize(title)
    title = re.sub(r"\s+", " ", title)
    if title in TITLE_NORMALIZATION:
        title = TITLE_NORMALIZATION[title]

    if title in BLACKLIST_TITLES:
        return

    report_url = urljoin(subtopic_url, result_link["href"]).strip()

    if report_url in REPORT_URL_MAPPING:
        report_url = REPORT_URL_MAPPING[report_url]

    # Ignore reports from other sites
    if BASE_URL not in report_url:
        return

    if report_url in BLACKLIST_REPORT_URLS:
        return

    if report_url in OEI_COMBINED_LANDING_PAGES:
        report_url = OEI_COMBINED_LANDING_PAGES[report_url][title]

    report_filename = report_url.split("/")[-1]
    report_id, extension = os.path.splitext(report_filename)

    if report_filename == "11302505.pdf":
        report_id = report_id + "_early_alert"

    # Try a quick check from the listing page to see if we can bail out based on
    # the year
    try:
        published_on_text = result.find_previous("dt").text.strip()
        published_on = datetime.datetime.strptime(published_on_text, "%m-%d-%Y")
    except (AttributeError, ValueError):
        published_on = None

    if published_on and published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    # This report is listed twice, once with the wrong date
    if (
        published_on
        and published_on.year == 2012
        and published_on.month == 1
        and published_on.day == 12
        and report_id == "20901002"
    ):
        return

    if report_id in REPORT_PUBLISHED_MAPPING:
        published_on = REPORT_PUBLISHED_MAPPING[report_id]
    else:
        # Process reports with landing pages
        if extension.lower() != ".pdf":
            report_url, published_on = report_from_landing_url(report_url)
        else:
            published_on = published_on_from_inline_link(result, report_filename, title, report_id, report_url)

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    result = {
        "inspector": "hhs",
        "inspector_url": "http://oig.hhs.gov",
        "agency": "hhs",
        "agency_name": "Health & Human Services",
        "report_id": report_id,
        "topic": topic.strip(),
        "url": report_url,
        "title": title,
        "published_on": datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if subtopic:
        result["subtopic"] = subtopic
    return result
Example #14
def report_from(result, year_range):
  # walk backwards through the doc to find the header title
  for element in result.previous_elements:
    if element and \
            isinstance(element, Tag) and \
            element.name == "span" and \
            element.has_attr("class") and \
            "collapseomatic" in element["class"]:
      header = element.text.strip().lower()
      break
  else:
    raise Exception("Couldn't find the header for %s" % result)

  if header.startswith("inspection"):
    category = "inspection"
  elif header.startswith("semiannual"):
    category = "semiannual_report"
  else:
    category = "other"

  report_id = os.path.splitext(os.path.basename(result['href']))[0]
  report_url = urljoin(REPORTS_URL, result['href'].strip())
  title = inspector.sanitize(result.text)

  # Each financial/performance report is linked twice, once for the IG's
  # transmittal letter and independent auditor's report, and once for
  # the IG's "Perspective on Management and Performance Challenges."
  # Skip the first one and save the second
  if "IG's Transmittal Letter and Independent Auditor's Report" in title \
          and "(pages" in title:
    return None
  elif title == "Hotline Poster":
    return None

  published_on = REPORT_PUBLISHED_MAPPING.get(title)
  if not published_on:
    published_on = REPORT_PUBLISHED_MAPPING.get(report_id)

  if not published_on:
    date_match = DATE_RE.match(title)
    if date_match:
      published_on = datetime.datetime.strptime(date_match.group(1), "%Y.%m")
      if date_match.lastindex == 2:
        title = date_match.group(2)
      elif header.startswith("semiannual"):
        title = published_on.strftime("Semiannual Report to Congress, %B %Y")
      else:
        raise Exception("No good title for %s" % report_id)

  if not published_on:
    admin.log_no_date("denali", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': "denali",
    'inspector_url': "http://www.oig.denali.gov",
    'agency': "denali",
    'agency_name': "Denali Commission",
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': category,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }

  return report
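
The header search in Example #14 leans on Python's for ... else: the else clause runs only when the loop finishes without break, which is exactly the "walked all the way back and found nothing" case. A stripped-down illustration using plain strings in place of soup elements:

def first_span_text(elements):
    for element in elements:
        if element.startswith("span:"):
            header = element[len("span:"):]
            break
    else:
        # Only reached when no element matched.
        raise Exception("Couldn't find the header")
    return header

print(first_span_text(["p:intro", "span:inspections", "div:footer"]))
# -> inspections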
Example #15
def report_from(result, year_range, report_type, title_prefix=None):
  report_url = urljoin(REPORTS_URL, result.select("a")[-1].get("href"))

  # Temporary hacks to account for link mistakes
  if report_url == "http://www.fec.gov/fecig/documents/Semi14a_000.pdf":
    report_url = "http://www.fec.gov/fecig/documents/Semi14a.pdf"
  if report_url == "http://www.fec.gov/fecig/documents/ReviewofOutstanding" \
                   "RecommendationsasofJune2014_001.pdf":
    report_url = "http://www.fec.gov/general/documents/ReviewofOutstanding" \
                 "RecommendationsasofJune2014.pdf"

  report_filename = report_url.split("/")[-1]
  report_id, extension = os.path.splitext(report_filename)

  published_on = None
  if report_url.endswith(".pdf"):
    # Inline report
    title = inspector.sanitize(result.contents[0].strip().rstrip("-"))
    title = re.sub("\\s+", " ", title)
    if title.endswith((" 200", " 201")):
      # some years are split up by a <span> tag
      title = title + result.contents[1].text
  else:
    # Some pages have separate landing pages.
    doc = utils.beautifulsoup_from_url(report_url)
    title = doc.select("h3")[1].text.strip()
    try:
      published_on_text = doc.select("h3")[2].text.strip()
    except IndexError:
      published_on_text = doc.select("h3")[1].text.strip()
    published_on_text = published_on_text.replace("Period ending ", "")
    published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if title == "November 2016" and report_url == "http://www.fec.gov/fecig/documents/OIGSemiannualReporttoCongress-May2016-FinalPublicDistribution.pdf":
    # Fix copy-paste error
    report_url = "http://www.fec.gov/fecig/documents/OIGFall2016SARFINAL.pdf"
    report_filename = report_url.split("/")[-1]
    report_id, extension = os.path.splitext(report_filename)

  if not published_on:
    if report_id in REPORT_PUBLISHED_MAPPING:
      published_on = REPORT_PUBLISHED_MAPPING[report_id]
  if not published_on:
    try:
      published_on_text = "-".join(re.search(r'(\w+)\s+(\d{4})', title).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%B-%Y')
    except (ValueError, AttributeError):
      pass

  if title_prefix:
    title = "{}{}".format(title_prefix, title)

  if not published_on:
    admin.log_no_date("fec", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': "fec",
    'inspector_url': "http://www.fec.gov/fecig/fecig.shtml",
    'agency': "fec",
    'agency_name': "Federal Election Commission",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),  # Date of publication
  }
  return report
Example #16
def audit_report_from(result, landing_url, year, year_range):
    if not result.text.strip():
        return
    link = result.find("a")

    report_url = urljoin(landing_url, link['href'])
    report_filename = report_url.split("/")[-1]
    report_id, _ = os.path.splitext(report_filename)

    try:
        title = result.select("blockquote")[0].contents[0]
    except IndexError:
        title = result.text

    title_prefixer = re.compile(
        "(Advisory|Management|Audit)\\s*(Letter|Report)\\s*[\\d\\-]+:\\s*",
        re.I)
    title = title_prefixer.sub("", title)

    estimated_date = False
    published_on = None

    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]

    cleaned_text = re.sub(r"\s+", " ", inspector.sanitize(result.text))
    if not published_on:
        try:
            published_on_text = re.search(r'(\w+ \d+, \d+)',
                                          cleaned_text).groups()[0]
            published_on = datetime.datetime.strptime(published_on_text,
                                                      '%B %d, %Y')
        except AttributeError:
            pass

    if not published_on:
        try:
            published_on_text = re.search(r'(\w+ \d+ , \d+)',
                                          cleaned_text).groups()[0]
            published_on = datetime.datetime.strptime(published_on_text,
                                                      '%B %d , %Y')
        except AttributeError:
            pass

    if not published_on:
        try:
            response = utils.scraper.request(method="HEAD", url=report_url)
            last_modified = response.headers["Last-Modified"]
            published_on = datetime.datetime.strptime(
                last_modified, "%a, %d %b %Y %H:%M:%S %Z")
        except ValueError:
            pass

    if not published_on:
        admin.log_no_date("archives", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'archives',
        'inspector_url': 'https://www.archives.gov/oig/',
        'agency': 'archives',
        'agency_name': 'National Archives and Records Administration',
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'type': 'audit',
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if estimated_date:
        report['estimated_date'] = estimated_date
    return report
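
Example #16's final date fallback issues a HEAD request through utils.scraper and parses the Last-Modified header. The same idea with plain requests, as an illustration rather than the project's own utility:

import datetime
import requests

def published_on_from_last_modified(url):
    """Fall back to the server's Last-Modified header for a publication date."""
    response = requests.head(url)
    last_modified = response.headers.get("Last-Modified")
    if not last_modified:
        return None
    try:
        # e.g. "Wed, 21 Oct 2015 07:28:00 GMT"
        return datetime.datetime.strptime(last_modified,
                                          "%a, %d %b %Y %H:%M:%S %Z")
    except ValueError:
        return None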
Example #17
def report_from(result, report_type, year_range):
  tds = result.select("td")
  if len(tds) > 0:
    title = inspector.sanitize(tds[0].text)
  else:
    return

  if (not title) or (title in HEADER_ROW_TEXT):
    # Skip the header rows
    return

  published_on_text = tds[2].text
  try:
    published_on = datetime.datetime.strptime(published_on_text, '%m/%d/%Y')
  except ValueError:
    published_on = datetime.datetime.strptime(published_on_text, '%m/%Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % title)
    return

  unreleased = False
  link = result.find("a")
  landing_url = urljoin(BASE_REPORT_URL, link.get('href'))
  if landing_url.endswith(".pdf"):
    # Inline report
    report_url = landing_url
    landing_url = None
    summary = None
  else:
    landing_page = utils.beautifulsoup_from_url(landing_url)
    summary = " ".join(landing_page.select("div.holder")[0].text.split())
    report_link = landing_page.find("a", href=PDF_REGEX)
    if report_link:
      report_url = urljoin(landing_url, report_link.get('href'))
    else:
      unreleased = True
      report_url = None

  report_id = tds[1].text.strip().replace("/", "-").replace(" ", "-")
  if report_id == "N-A":
    report_id = tds[0].text.strip().replace("/", "-").replace(" ", "-")
  if report_id == "":
    if report_url:
      report_id = os.path.splitext(os.path.basename(report_url))[0]
    else:
      report_id = os.path.splitext(os.path.basename(landing_url))[0]

  if report_url:
    # OIG MAR-2012-10/PA-12-87 is posted under both Audits/Evaluations/MARs and
    # Congressional Requests.
    if report_url in saved_report_urls:
      return
    saved_report_urls.add(report_url)

  report = {
    'inspector': "pbgc",
    'inspector_url': "http://oig.pbgc.gov",
    'agency': "pbgc",
    'agency_name': "Pension Benefit Guaranty Corporation",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if summary:
    report['summary'] = summary
  if unreleased:
    report['unreleased'] = unreleased
  if landing_url:
    report['landing_url'] = landing_url
  return report
Example #18
def report_from(result, base_url):
  report = {
    'inspector': 'gsa',
    'inspector_url': 'https://www.gsaig.gov/',
    'agency': 'gsa',
    'agency_name': 'General Services Administration'
  }

  title_h4 = result.find("div", property="dc:title").h4
  title = inspector.sanitize(title_h4.text)
  if title_h4.a:
    report['landing_url'] = urljoin(base_url, title_h4.a["href"])
  else:
    report['landing_url'] = base_url

  description = result.find("div", class_="field-name-field-description")
  if description:
    report['summary'] = inspector.sanitize(description.text)

  unreleased = False
  url = None
  file_section = result.find("span", class_="file")
  if file_section:
    file_links = file_section.find_all("a")
    if len(file_links) > 1:
      raise Exception("Multiple file links for %s" % title)
    link = file_links[0]
    url = link.get('href')
    url = urljoin(base_url, url)

    if url == "https://www.gsaig.gov/sites/default/files/recovery-reports/FINAL%20TESTIMONY%20FOR%20APRIL%2021.pdf":
      # This testimony is also posted in the testimony section, so we can skip
      # the one posted under recovery reports
      return

    report_id = os.path.splitext(os.path.basename(unquote_plus(url)))[0]
    report_id = re.sub('[-/\\s]+', '-', inspector.sanitize(report_id))
  else:
    unreleased = report['unreleased'] = True
    report_id = re.sub('[-/\\s]+', '-', inspector.sanitize(title))

  published_date_div = result.find("div", class_="field-name-post-date")
  if published_date_div:
    published_date = published_date_div.text
    date = datetime.strptime(published_date, "%B %d, %Y")
  else:
    # get last match
    match = None
    for match in DATE_RE.finditer(title):
      pass
    published_date = match.group(0)
    date = datetime.strptime(published_date, "%B %d, %Y")

  report_type = type_for(base_url)

  report['type'] = report_type
  report['published_on'] = datetime.strftime(date, "%Y-%m-%d")
  if not unreleased:
    report['url'] = url
    if url.lower().endswith(".pdf"):
      report['file_type'] = "pdf"
    elif url.lower().endswith(".doc"):
      report['file_type'] = "doc"
    elif url.lower().endswith(".xls"):
      report['file_type'] = "xls"
    elif url.lower().endswith(".ppt"):
      report['file_type'] = "ppt"
    else:
      raise Exception("Unexpected filetype for %s" % url)
  report['report_id'] = report_id
  report['title'] = title.strip()

  return report
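
The extension chain at the end of Example #18 can also be written as a dictionary lookup keyed on os.path.splitext; a behavior-equivalent sketch:

import os

FILE_TYPES = {".pdf": "pdf", ".doc": "doc", ".xls": "xls", ".ppt": "ppt"}

def file_type_for(url):
    extension = os.path.splitext(url.lower())[1]
    try:
        return FILE_TYPES[extension]
    except KeyError:
        raise Exception("Unexpected filetype for %s" % url)

print(file_type_for("https://example.gov/report.PDF"))  # pdf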
Example #19
def report_from(result, landing_url, report_type, year_range):
  if not result.text or result.text in BLACKLIST_REPORT_TITLES:
    # There are a few empty links due to bad html and some links for alternative
    # formats (PDF) that we will just ignore.
    return

  link_text = None
  if result.name == 'a':
    report_url = result.get('href')
    link_text = inspector.sanitize(result.text)
    title = inspector.sanitize("%s %s" % (result.text, result.next_sibling))
  else:
    links = [link for link in result.find_all('a') if link.text.strip()]
    report_url = links[0].get('href')
    link_text = inspector.sanitize(result.a.text)
    title = inspector.sanitize(result.text)
  report_url = urljoin(landing_url, report_url)
  report_filename = os.path.basename(report_url)

  if title.endswith("PDF"):
    title = title[:-3]
  title = title.rstrip(" .")

  prev = result.previous_sibling
  if isinstance(prev, NavigableString) and "See, also:" in prev:
    return None

  report_no_match = REPORT_NO_RE.match(link_text)
  if report_no_match:
    report_id = report_no_match.group(0)
    if "fraud" in report_url.lower():
      report_id = "fraud-alert-" + report_id
    elif "Client_Trust_Fund" in report_url:
      report_id = "CTF-" + report_id
    elif report_filename.startswith("sr"):
      report_id = "special-report-" + report_id
  else:
    report_id, _ = os.path.splitext(report_filename)
    report_id = unquote(report_id)
  report_id = "-".join(report_id.split())
  report_id = report_id.replace("\\", "")  # strip backslashes

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  elif link_text == "June 2015":
    published_on = datetime.datetime(2015, 6, 1)
  else:
    published_on_text = None
    try:
      published_on_text = re.search(r'(\d+/\d+/\d+)', title).groups()[0]
    except AttributeError:
      pass
    if not published_on_text:
      try:
        published_on_text = re.search(r'(\w+ \d+, \d+)', title).groups()[0]
      except AttributeError:
        pass
    if not published_on_text:
      try:
        published_on_text = re.search(r'(\d+/\d+)', title).groups()[0]
      except AttributeError:
        pass

    if not published_on_text:
      admin.log_no_date("lsc", report_id, title, report_url)
      return

    if not published_on:
      datetime_formats = [
        '%B %d, %Y',
        '%m/%d/%Y',
        '%m/%d/%y',
        '%m/%Y',
        '%m/%y'
      ]
      for datetime_format in datetime_formats:
        try:
          published_on = datetime.datetime.strptime(published_on_text, datetime_format)
        except ValueError:
          pass
        else:
          break

  if not published_on:
    admin.log_no_date("lsc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'lsc',
    'inspector_url': 'https://www.oig.lsc.gov',
    'agency': 'lsc',
    'agency_name': 'Legal Services Corporation',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }

  if estimated_date:
    report['estimated_date'] = estimated_date

  if report_url in ("https://www.oig.lsc.gov/core-legal-services",):
    report['file_type'] = "html"

  if report_url.startswith("https://oig.lsc.gov/mapping/references/eval"):
    report['unreleased'] = True
    report['missing'] = True

  return report
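
The format-probing loop in Example #19 (try each strptime format in order, keep the first that parses) is a recurring idiom across these scrapers; factored into a helper, it looks like this sketch:

import datetime

DATETIME_FORMATS = ['%B %d, %Y', '%m/%d/%Y', '%m/%d/%y', '%m/%Y', '%m/%y']

def parse_first_matching(text, formats=DATETIME_FORMATS):
    """Return the first successful strptime parse, or None."""
    for datetime_format in formats:
        try:
            return datetime.datetime.strptime(text, datetime_format)
        except ValueError:
            continue
    return None

print(parse_first_matching("June 1, 2015"))    # 2015-06-01 00:00:00
print(parse_first_matching("06/01/2015"))      # 2015-06-01 00:00:00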
Example #20
def report_from_list(result, landing_url, report_type, year_range):
  missing = False
  title = re.sub("\\s+", " ", inspector.sanitize(result.text))

  report_id = None
  report_id_match = REPORT_ID_RE_1.search(title)
  if report_id_match:
    report_id = report_id_match.group(1) or report_id_match.group(2)

  if 'Non-Public Report' in title:
    unreleased = True
    report_url = None
    if report_id in ("ER-11-01", "ER-12-01", "ER-13-01", "ER-14-01",
                     "ER-15-01", "ER-16-01", "ER-17-01"):
      # These reports are listed in two places, once with a PDF, once without
      return
    if not report_id:
      report_id = "-".join(title.split())
      report_id = report_id.replace(":", "")
  else:
    unreleased = False
    link = result.find("a")
    if not link:
      return None
    # Some reports have incorrect relative paths
    relative_report_url = link.get('href').replace("../", "")
    report_url = urljoin(landing_url, relative_report_url)
    if report_url == "https://www.flra.gov/system/files/webfm/Inspector%20General/FLRA%20IPERA%20Compliance%202011.pdf" and report_id == "ER-12-02":
      report_url = "https://www.flra.gov/system/files/webfm/Inspector%20General/IPERA%20March%202012.pdf"
    if not report_id:
      report_filename = report_url.split("/")[-1]
      report_id, _ = os.path.splitext(report_filename)
      report_id = "-".join(unquote(report_id).split())

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  if not published_on:
    try:
      published_on = datetime.datetime.strptime(title, '%B %Y')
    except ValueError:
      pass

  if not published_on:
    admin.log_no_date("flra", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  if published_on.year <= 2011 and not unreleased and not report_url:
    # Some older reports aren't posted
    unreleased = True
    missing = True

  report = {
    'inspector': 'flra',
    'inspector_url': 'https://www.flra.gov/components-offices/offices/office-inspector-general',
    'agency': 'flra',
    'agency_name': 'Federal Labor Relations Authority',
    'file_type': 'pdf',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  if missing:
    report['missing'] = missing
  return report
Example #21
def clean_text(text):
  return re.sub("[ \n]+", " ", inspector.sanitize(text))
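
A quick illustration of the collapse in Example #21 (note the character class covers spaces and newlines but not tabs), with the sanitize step omitted for the demo:

import re

def clean_text(text):
    return re.sub("[ \n]+", " ", text)

assert clean_text("Audit  Report\nJune 30,  2014") == "Audit Report June 30, 2014"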
Example #22
def audit_report_from(result, landing_url, year, year_range):
  link = result.find("a")

  report_url = urljoin(landing_url, link.get('href'))
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  try:
    title = result.select("blockquote")[0].contents[0]
  except IndexError:
    title = result.text

  title_prefixer = re.compile("(Advisory|Management|Audit)\\s*(Letter|Report)\\s*[\\d\\-]+:\\s*", re.I)
  title = title_prefixer.sub("", title)

  estimated_date = False
  published_on = None

  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  cleaned_text = re.sub(r"\s+", " ", inspector.sanitize(result.text))
  if not published_on:
    try:
      published_on_text = re.search(r'(\w+ \d+, \d+)', cleaned_text).groups()[0]
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except AttributeError:
      pass

  if not published_on:
    try:
      published_on_text = re.search(r'(\w+ \d+ , \d+)', cleaned_text).groups()[0]
      published_on = datetime.datetime.strptime(published_on_text, '%B %d , %Y')
    except AttributeError:
      pass

  if not published_on:
    try:
      response = utils.scraper.request(method="HEAD", url=report_url)
      last_modified = response.headers["Last-Modified"]
      published_on = datetime.datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
    except ValueError:
      pass

  if not published_on:
    admin.log_no_date("archives", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'archives',
    'inspector_url': 'https://www.archives.gov/oig/',
    'agency': 'archives',
    'agency_name': 'National Archives and Records Administration',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': 'audit',
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  return report
Example #23
def report_from(result, landing_url, report_type, year_range, year=None):
    if not result.text or result.text in BLACKLIST_REPORT_TITLES:
        # There are a few empty links due to bad html and some links for alternative
        # formats (PDF) that we will just ignore.
        return

    link_text = None
    if result.name == 'a':
        report_url = result.get('href')
        link_text = inspector.sanitize(result.text)
        title = inspector.sanitize("%s %s" %
                                   (result.text, result.next_sibling))
    else:
        links = [link for link in result.find_all('a') if link.text.strip()]
        report_url = links[0].get('href')
        link_text = inspector.sanitize(result.a.text)
        title = inspector.sanitize(result.text)
    report_url = urljoin(landing_url, report_url)
    report_filename = os.path.basename(report_url)

    prev = result.previous_sibling
    if isinstance(prev, NavigableString) and "See, also:" in prev:
        return None

    report_no_match = REPORT_NO_RE.match(link_text)
    if report_no_match:
        report_id = report_no_match.group(0)
        if "fraud" in report_url.lower():
            report_id = "fraud-alert-" + report_id
        elif "Client_Trust_Fund" in report_url:
            report_id = "CTF-" + report_id
        elif report_filename.startswith("sr"):
            report_id = "special-report-" + report_id
    else:
        report_id, _ = os.path.splitext(report_filename)
        report_id = unquote(report_id)
    report_id = "-".join(report_id.split())
    report_id = report_id.replace("\\", "")  # strip backslashes

    estimated_date = False
    published_on = None
    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]
    elif link_text == "June 2015":
        published_on = datetime.datetime(2015, 6, 1)
    else:
        try:
            published_on_text = re.search(r'(\d+/\d+/\d+)', title).groups()[0]
        except AttributeError:
            try:
                published_on_text = re.search(r'(\w+ \d+, \d+)',
                                              title).groups()[0]
            except AttributeError:
                try:
                    published_on_text = re.search(r'(\d+/\d+)',
                                                  title).groups()[0]
                except AttributeError:
                    if year is None:
                        raise Exception(
                            "No date or year was detected for %s (%s)" %
                            (report_id, title))
                    # Since we only have the year, set this to Nov 1st of that year
                    published_on = datetime.datetime(year, 11, 1)
                    estimated_date = True

        if not published_on:
            datetime_formats = [
                '%B %d, %Y', '%m/%d/%Y', '%m/%d/%y', '%m/%Y', '%m/%y'
            ]
            for datetime_format in datetime_formats:
                try:
                    published_on = datetime.datetime.strptime(
                        published_on_text, datetime_format)
                except ValueError:
                    pass
                else:
                    break

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'lsc',
        'inspector_url': 'https://www.oig.lsc.gov',
        'agency': 'lsc',
        'agency_name': 'Legal Services Corporation',
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }

    if estimated_date:
        report['estimated_date'] = estimated_date

    if report_url in ("https://www.oig.lsc.gov/core-legal-services",):
        report['file_type'] = "html"

    return report
Example #24
def report_from(result, year_range):
    # walk backwards through the doc to find the header title
    for element in result.previous_elements:
        if (element and
                isinstance(element, Tag) and
                element.name == "span" and
                element.has_attr("class") and
                "collapseomatic" in element["class"]):
            header = element.text.strip().lower()
            break
    else:
        raise Exception("Couldn't find the header for %s" % result)

    if header.startswith("inspection"):
        category = "inspection"
    elif header.startswith("semiannual"):
        category = "semiannual_report"
    else:
        category = "other"

    report_id = os.path.splitext(os.path.basename(result['href']))[0]
    report_url = urljoin(REPORTS_URL, result['href'])
    title = inspector.sanitize(result.text)

    # Each financial/performance report is linked twice, once for the IG's
    # transmittal letter and independent auditor's report, and once for
    # the IG's "Perspective on Management and Performance Challenges."
    # Skip the first one and save the second
    if ("IG's Transmittal Letter and Independent Auditor's Report" in title
            and "(pages" in title):
        return None
    elif title == "Hotline Poster":
        return None

    published_on = REPORT_PUBLISHED_MAPPING.get(title)
    if not published_on:
        published_on = REPORT_PUBLISHED_MAPPING.get(report_id)

    if not published_on:
        date_match = DATE_RE.match(title)
        if date_match:
            published_on = datetime.datetime.strptime(date_match.group(1),
                                                      "%Y.%m")
            if date_match.lastindex == 2:
                title = date_match.group(2)
            elif header.startswith("semiannual"):
                title = published_on.strftime(
                    "Semiannual Report to Congress, %B %Y")
            else:
                raise Exception("No good title for %s" % report_id)

    if not published_on:
        raise Exception("Couldn't find date: %s, %s" % (title, report_id))

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': "denali",
        'inspector_url': "http://www.oig.denali.gov",
        'agency': "denali",
        'agency_name': "Denali Commission",
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'type': category,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }

    return report
Example #25
def report_from(result, report_type, year_range):
    tds = result.select("td")
    if len(tds) > 0:
        title = inspector.sanitize(tds[0].text)
    else:
        return

    if (not title) or (title in HEADER_ROW_TEXT):
        # Skip the header rows
        return

    published_on_text = tds[2].text
    try:
        published_on = datetime.datetime.strptime(published_on_text,
                                                  '%m/%d/%Y')
    except ValueError:
        published_on = datetime.datetime.strptime(published_on_text, '%m/%Y')

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % title)
        return

    unreleased = False
    link = result.find("a")
    landing_url = urljoin(BASE_REPORT_URL, link.get('href'))
    if landing_url.endswith(".pdf"):
        # Inline report
        report_url = landing_url
        landing_url = None
        summary = None
    else:
        landing_page = utils.beautifulsoup_from_url(landing_url)
        summary = " ".join(landing_page.select("div.holder")[0].text.split())
        report_link = landing_page.find("a", href=PDF_REGEX)
        if report_link:
            report_url = urljoin(landing_url, report_link.get('href'))
        else:
            unreleased = True
            report_url = None

    report_id = tds[1].text.strip().replace("/", "-").replace(" ", "-")
    if report_id == "N-A":
        report_id = tds[0].text.strip().replace("/", "-").replace(" ", "-")
    if report_id == "":
        if report_url:
            report_id = os.path.splitext(os.path.basename(report_url))[0]
        else:
            report_id = os.path.splitext(os.path.basename(landing_url))[0]

    if report_url:
        # OIG MAR-2012-10/PA-12-87 is posted under both Audits/Evaluations/MARs and
        # Congressional Requests.
        if report_url in saved_report_urls:
            return
        saved_report_urls.add(report_url)

    report = {
        'inspector': "pbgc",
        'inspector_url': "http://oig.pbgc.gov",
        'agency': "pbgc",
        'agency_name': "Pension Benefit Guaranty Corporation",
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if summary:
        report['summary'] = summary
    if unreleased:
        report['unreleased'] = unreleased
    if landing_url:
        report['landing_url'] = landing_url
    return report
Example #26
def report_from(result, reports_page, report_type, year_range):
    unreleased = False
    summary = None
    landing_url = None
    estimated_date = False

    # audits have some data, but link to landing page for summary and URL
    if report_type == "audit":
        landing_a = result.select(".cell3 a")[0]
        landing_url = urljoin(reports_page, landing_a['href'])
        long_title = landing_a.text.strip()

        # https://www.cncsoig.gov/news-entry/97-09 and
        # https://www.cncsoig.gov/news-entry/97-09-0 are duplicates of each other
        if landing_url == "https://www.cncsoig.gov/news-entry/97-09-0":
            return

        # PDF URL and summary are on the report's landing page
        report_url, summary, title = extract_from_release_page(landing_url)
        if not report_url:
            unreleased = True

        if not title:
            title = long_title

        # the report PDF URL can be pulled from the comments
        # we're ignoring this since we're going to the landing page anyhow.
        # re.search("href=\"(/sites/default/files/.*?)\">GO", str(result))

        report_id = result.select(".cell1")[0].text.strip()
        stamp = result.select(".cell2")[0].text.strip()
        published_on = datetime.datetime.strptime(stamp, "%m.%d.%Y")

    elif report_type == "investigation":
        stamp = result.select(".cell2")[0].text.strip()
        published_on = datetime.datetime.strptime(stamp, "%Y-%m-%d")
        title = result.select(".cell3 p")[0].text.strip()

        report_url = result.select(".cell3 a")[0]['href']
        report_url = urljoin(reports_page, report_url)
        report_id = os.path.splitext(report_url.split("/")[-1])[0]

    elif report_type == "semiannual_report":
        report_url = result.select(".cell4 a")[0]['href']
        report_url = urljoin(reports_page, report_url)
        report_id = os.path.splitext(report_url.split("/")[-1])[0]

        stamps = result.select(".cell2")[0].text.strip().split()
        # the agency can mess up the date order
        if stamps[2] == "09.30.2013":
            stamp = stamps[0]
        else:
            stamp = stamps[2]
        published_on = datetime.datetime.strptime(stamp, "%m.%d.%Y")

        title = str.join(" ", stamps)

    elif report_type == "case":
        report_type = "investigation"
        title = result.select("div")[0].text

        id_text = None
        summary = ""
        for p in result.select("p"):
            text = inspector.sanitize(p.text.strip())
            summary += text + "\n\n"
            if text.lower().strip("-").strip().startswith(
                ("case id", "case d")):
                id_text = text
        summary = summary.strip()
        if not id_text:
            for div in result.select("div"):
                text = inspector.sanitize(div.text.strip())
                if text.lower().strip("-").strip().startswith(
                    ("case id", "case d")):
                    id_text = text
        if not id_text:
            match = re.search("Case I?D: ([0-9]{4}-[0-9]{3})", title)
            if match:
                id_text = match.group(1)
        if not id_text:
            raise Exception("Could not find Case ID for an investigation\n%s" %
                            result.text)

        # note that some cases have more than one id. We are taking only the last id.
        report_id = re.sub(r'\([^)]*\)', '', id_text).strip().split(" ")[-1]

        landing_url = reports_page
        unreleased = True
        report_url = None

        date_match = DATE_RE.match(result.text.replace(title, "").strip())
        if date_match:
            published_on = datetime.datetime.strptime(date_match.group(0),
                                                      "%Y-%m-%d")
        else:
            year = int(report_id.replace("\u2010", "-").split("-")[0])
            published_on = datetime.date(year, 1, 1)
            estimated_date = True

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'cncs',
        'inspector_url': 'https://www.cncsoig.gov',
        'agency': 'cncs',
        'agency_name': 'Corporation for National and Community Service',
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'type': report_type,
        'published_on':
        datetime.datetime.strftime(published_on,
                                   "%Y-%m-%d"),  # Date of publication
    }

    if unreleased:
        report['unreleased'] = True

    if summary:
        report['summary'] = summary

    if landing_url:
        report['landing_url'] = landing_url

    if estimated_date:
        report['estimated_date'] = estimated_date

    return report
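
The case-ID extraction in Example #26's "case" branch strips any parenthesized note, then keeps the last whitespace-separated token, which is how a line listing two IDs resolves to the final one. With an illustrative input:

import re

id_text = "Case ID: 2013-027 (closed) 2014-031"
report_id = re.sub(r'\([^)]*\)', '', id_text).strip().split(" ")[-1]
print(report_id)  # 2014-031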
Example #27
def clean_text(text):
    return re.sub("[ \n]+", " ", inspector.sanitize(text))
Example #28
def report_from(result, reports_page, report_type, year_range):
  unreleased = False
  summary = None
  landing_url = None
  estimated_date = False

  # audits have some data, but link to landing page for summary and URL
  if report_type == "audit":
    landing_a = result.select(".cell3 a")[0]
    landing_url = urljoin(reports_page, landing_a['href'])
    long_title = landing_a.text.strip()

    # https://www.cncsoig.gov/news-entry/97-09 and
    # https://www.cncsoig.gov/news-entry/97-09-0 are duplicates of each other
    if landing_url == "https://www.cncsoig.gov/news-entry/97-09-0":
      return

    # PDF URL and summary are on the report's landing page
    report_url, summary, title = extract_from_release_page(landing_url)
    if not report_url:
      unreleased = True

    if not title:
      title = long_title

    # the report PDF URL can be pulled from the comments
    # we're ignoring this since we're going to the landing page anyhow.
    # re.search("href=\"(/sites/default/files/.*?)\">GO", str(result))

    report_id = result.select(".cell1")[0].text.strip()
    stamp = result.select(".cell2")[0].text.strip()
    published_on = datetime.datetime.strptime(stamp, "%m.%d.%Y")

  elif report_type == "investigation":
    stamp = result.select(".cell2")[0].text.strip()
    published_on = datetime.datetime.strptime(stamp, "%Y-%m-%d")
    title = result.select(".cell3 p")[0].text.strip()

    report_url = result.select(".cell3 a")[0]['href']
    report_url = urljoin(reports_page, report_url)
    report_id = os.path.splitext(report_url.split("/")[-1])[0]

  elif report_type == "semiannual_report":
    report_url = result.select(".cell4 a")[0]['href']
    report_url = urljoin(reports_page, report_url)
    report_id = os.path.splitext(report_url.split("/")[-1])[0]

    stamps = result.select(".cell2")[0].text.strip().split()
    # the agency can mess up the date order
    if stamps[2] == "09.30.2013":
      stamp = stamps[0]
    else:
      stamp = stamps[2]
    published_on = datetime.datetime.strptime(stamp, "%m.%d.%Y")

    title = str.join(" ", stamps)

  elif report_type == "case":
    report_type = "investigation"
    title = result.select("div")[0].text

    id_text = None
    summary = ""
    for p in result.select("p"):
      text = inspector.sanitize(p.text.strip())
      summary += text + "\n\n"
      if text.lower().strip("-").strip().startswith("case id"):
        id_text = text
    summary = summary.strip()
    if not id_text:
      for div in result.select("div"):
        text = inspector.sanitize(div.text.strip())
        if text.lower().strip("-").strip().startswith("case id"):
          id_text = text
    if not id_text:
      raise Exception("Could not find Case ID for an investigation\n%s" %
                      result.text)

    # note that some cases have more than one id. We are taking only the last id.
    report_id = re.sub(r'\([^)]*\)', '', id_text).strip().split(" ")[-1]

    landing_url = reports_page
    unreleased = True
    report_url = None

    year = int(report_id.replace("\u2010", "-").split("-")[0])
    published_on = datetime.date(year, 1, 1)
    estimated_date = True

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'cncs',
    'inspector_url': 'https://www.cncsoig.gov',
    'agency': 'cncs',
    'agency_name': 'Corporation for National and Community Service',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': report_type,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),  # Date of publication
  }

  if unreleased:
    report['unreleased'] = True

  if summary:
    report['summary'] = summary

  if landing_url:
    report['landing_url'] = landing_url

  if estimated_date:
    report['estimated_date'] = estimated_date

  return report
Example #29
def report_from(result, year_range, topic, subtopic_url, subtopic=None):
    # Ignore links to other subsections
    if result.get('class') and result['class'][0] == 'crossref':
        return

    if result.name == 'a':
        # Sometimes we already have a link
        result_link = result
    else:
        result_link = result.find("a")

    # No link found, this is probably just an extra <li> on the page.
    if result_link is None:
        return

    # If this is just an anchor link on the same page, skip
    if not strip_url_fragment(result_link['href']):
        return

    title = result_link.text
    title = title.replace("\xe2\x80\x93", "-")
    title = inspector.sanitize(title)
    title = re.sub(r'\s+', ' ', title)
    if title in TITLE_NORMALIZATION:
        title = TITLE_NORMALIZATION[title]

    if title in BLACKLIST_TITLES:
        return

    report_url = urljoin(subtopic_url, result_link['href']).strip()

    if report_url in REPORT_URL_MAPPING:
        report_url = REPORT_URL_MAPPING[report_url]

    # Ignore reports from other sites
    if BASE_URL not in report_url:
        return

    if report_url in BLACKLIST_REPORT_URLS:
        return

    if report_url in OEI_COMBINED_LANDING_PAGES:
        report_url = OEI_COMBINED_LANDING_PAGES[report_url][title]

    report_filename = report_url.split("/")[-1]
    report_id, extension = os.path.splitext(report_filename)

    if report_filename == "11302505.pdf":
        report_id = report_id + "_early_alert"

    # Try a quick check from the listing page to see if we can bail out based on
    # the year
    try:
        published_on_text = result.find_previous("dt").text.strip()
        published_on = datetime.datetime.strptime(published_on_text,
                                                  "%m-%d-%Y")
    except (AttributeError, ValueError):
        published_on = None

    if published_on and published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    # This report is listed twice, once with the wrong date
    if published_on and published_on.year == 2012 and published_on.month == 1 \
        and published_on.day == 12 and report_id == "20901002":
        return

    if report_id in REPORT_PUBLISHED_MAPPING:
        published_on = REPORT_PUBLISHED_MAPPING[report_id]
    else:
        # Process reports with landing pages
        if extension.lower() != '.pdf':
            report_url, published_on = report_from_landing_url(report_url)
        else:
            published_on = published_on_from_inline_link(
                result,
                report_filename,
                title,
                report_id,
                report_url,
            )

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    result = {
        'inspector': 'hhs',
        'inspector_url': 'http://oig.hhs.gov',
        'agency': 'hhs',
        'agency_name': 'Health & Human Services',
        'report_id': report_id,
        'topic': topic.strip(),
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if subtopic:
        result['subtopic'] = subtopic
    return result
Example #30
def report_from(result, year_range, report_type, title_prefix=None):
    report_url = urljoin(REPORTS_URL, result.select("a")[-1].get("href"))

    # Temporary hacks to account for link mistakes
    if report_url == "http://www.fec.gov/fecig/documents/Semi14a_000.pdf":
        report_url = "http://www.fec.gov/fecig/documents/Semi14a.pdf"
    if report_url == "http://www.fec.gov/fecig/documents/ReviewofOutstanding" \
                     "RecommendationsasofJune2014_001.pdf":
        report_url = "http://www.fec.gov/general/documents/ReviewofOutstanding" \
                     "RecommendationsasofJune2014.pdf"

    report_filename = report_url.split("/")[-1]
    report_id, extension = os.path.splitext(report_filename)

    published_on = None
    if report_url.endswith(".pdf"):
        # Inline report
        title = inspector.sanitize(result.contents[0].strip().rstrip("-"))
        title = re.sub("\\s+", " ", title)
        if title.endswith((" 200", " 201")):
            # some years are split up by a <span> tag
            title = title + result.contents[1].text
    else:
        # Some pages have separate landing pages.
        doc = utils.beautifulsoup_from_url(report_url)
        title = doc.select("h3")[1].text.strip()
        try:
            published_on_text = doc.select("h3")[2].text.strip()
        except IndexError:
            published_on_text = doc.select("h3")[1].text.strip()
        published_on_text = published_on_text.replace("Period ending ", "")
        published_on = datetime.datetime.strptime(published_on_text,
                                                  '%B %d, %Y')

    if title == "November 2016" and report_url == "http://www.fec.gov/fecig/documents/OIGSemiannualReporttoCongress-May2016-FinalPublicDistribution.pdf":
        # Fix copy-paste error
        report_url = "http://www.fec.gov/fecig/documents/OIGFall2016SARFINAL.pdf"
        report_filename = report_url.split("/")[-1]
        report_id, extension = os.path.splitext(report_filename)

    if not published_on:
        if report_id in REPORT_PUBLISHED_MAPPING:
            published_on = REPORT_PUBLISHED_MAPPING[report_id]
    if not published_on:
        try:
            published_on_text = "-".join(
                re.search(r'(\w+)\s+(\d{4})', title).groups())
            published_on = datetime.datetime.strptime(published_on_text,
                                                      '%B-%Y')
        except (ValueError, AttributeError):
            pass

    if title_prefix:
        title = "{}{}".format(title_prefix, title)

    if not published_on:
        admin.log_no_date("fec", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': "fec",
        'inspector_url': "http://www.fec.gov/fecig/fecig.shtml",
        'agency': "fec",
        'agency_name': "Federal Election Commission",
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on':
        datetime.datetime.strftime(published_on,
                                   "%Y-%m-%d"),  # Date of publication
    }
    return report
Example #31
def remove_linebreaks(s):
  # Lots of weird tabs, etc. inside HTML strings. We would replace them all at
  # once, but since utils.beautifulsoup_from_url takes the HTML straight to
  # soup, we'll do it individually for the fields we need.
  return inspector.sanitize(s.replace('\n', '').replace('\t', '').replace('\r', ''))
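
Unlike clean_text in Examples #21 and #27, this helper removes the characters outright instead of collapsing them to a space. A quick check with inspector.sanitize stubbed out:

def remove_linebreaks(s):
    return s.replace('\n', '').replace('\t', '').replace('\r', '')

assert remove_linebreaks("GSA\tOIG\r\nReport") == "GSAOIGReport"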