Example #1
0
def process_restricted_report(div, year_range, REPORTS_URL):
  """Build the report dict for one restricted (unreleased) GAO listing entry.

  The first child of `div` is used as the title; the text of the nested
  `div > span` is split on ': ' and its first piece is used as the report
  number, its last piece as the date text handed to parse_date().

  Returns the report dict, or None when no date could be parsed (the case
  is logged through admin.log_no_date) or when the report's year is not in
  `year_range`.
  """
  title = div.contents[0]
  span_parts = div.div.span.string.strip().split(': ')
  report_number = span_parts[0]
  report_date = parse_date(span_parts[-1])

  if not report_date:
    admin.log_no_date("gaoreports", report_number, title)
    return None

  if report_date.year not in year_range:
    return None

  return {
    'inspector': 'gaoreports',
    'inspector_url': 'https://www.gao.gov',
    # often GAO reports do focus on a program in a specific external agency,
    # but we're not attempting to discern it in a structured way.
    # We'll just have GAO for the inspector and the agency.
    'agency': 'gao',
    'agency_name': 'Government Accountability Office',
    'report_id': report_number,
    'unreleased': True,
    'landing_url': REPORTS_URL,
    'title': title,
    'type': 'Unreleased report',
    'published_on': datetime.datetime.strftime(report_date, "%Y-%m-%d"),
  }
Example #2
0
def report_from(all_text, link_text, link_url, page_url, published_on, paragraph):
  """Assemble the report dict for one Export-Import Bank OIG link.

  The report ID is taken, in order of preference, from the resolved link
  URL, from the surrounding text, or from the link text (how the link text
  is used depends on which listing page the link came from).

  Returns the report dict, or None when `published_on` is None (logged via
  admin.log_no_date).

  Raises Exception when no report ID can be determined or the file type
  cannot be guessed from the URL.
  """
  report = {
    'inspector': 'exim',
    'inspector_url': 'http://www.exim.gov/about/oig',
    'agency': 'exim',
    'agency_name': 'Export-Import Bank of the United States'
  }

  link_text = link_text.strip()
  link_url = urljoin(page_url, link_url)
  all_text = all_text.strip()
  report_type = type_for(page_url, all_text, paragraph)

  id_in_url = IDENTIFIER_RE_URL.search(link_url)
  id_in_text = IDENTIFIER_RE_TEXT.search(all_text)
  if id_in_url:
    report_id = id_in_url.group(1)
  elif id_in_text:
    report_id = id_in_text.group(1)
  elif page_url in (PRESS_RELEASES_URL,
                    INSPECTIONS_EVALUATIONS_SPECIAL_REPORTS_URL):
    report_id = link_text.replace(":", "")
  elif page_url == SEMIANNUAL_REPORTS_AND_TESTIMONIES_URL:
    report_id = os.path.splitext(link_text)[0]
  elif (page_url == AUDIT_REPORTS_URL and
        paragraph.find_previous_sibling("h2").text == "Peer Review Reports"):
    report_id = link_text
  else:
    raise Exception("No report ID found for %r" % link_text)
  # clip report_id if it gets too long
  report_id = report_id[:100]

  if published_on is None:
    admin.log_no_date("exim", report_id, link_text, link_url)
    return None

  if link_url.endswith(".pdf"):
    file_type = "pdf"
  elif link_url.endswith(".docx"):
    file_type = "docx"
  elif link_url.endswith((".htm", ".html", ".cfm")):
    file_type = "htm"
    if link_url.endswith(".cfm"):
      # .cfm links are additionally flagged as unreleased and missing.
      report['unreleased'] = True
      report['missing'] = True
  elif not os.path.splitext(os.path.basename(link_url))[1]:
    # No extension at all on the last path segment: treated as HTML.
    file_type = "htm"
  else:
    raise Exception("Unable to guess file type\n%r" % link_url)

  report.update({
    'type': report_type,
    'published_on': datetime.strftime(published_on, "%Y-%m-%d"),
    'url': link_url,
    'report_id': report_id,
    'title': link_text,
    'file_type': file_type,
  })
  return report
def report_from(result, page_url, year_range):
    """Turn one FCC OIG table row into a report dict.

    Rows with fewer than two <td> cells, rows whose second cell has
    align="Center" (column headers) and rows with no text are skipped.

    Returns the report dict, or None when the row is skipped, when the date
    in the first cell matches none of DATE_FORMATS (logged via
    admin.log_no_date), or when the year is outside `year_range`.

    Raises Exception when the row has no link, or more than one link left
    after filtering with RE_EXTRA_FILES.
    """
    cells = result.find_all("td")
    if len(cells) < 2:
        # 0 cells: degenerate row; 1 cell: title row with colspan="3"
        return None
    if cells[1]["align"] == "Center":
        # Column headers
        return None
    if not result.text.strip():
        # Empty spacer row
        return None

    title_paragraph = cells[1].p
    if title_paragraph is None:
        raw_title = cells[1].text
    else:
        raw_title = title_paragraph.contents[0]
    title = re.sub("\\s+", " ", raw_title).strip()

    hrefs = [anchor["href"] for anchor in result.find_all("a")]
    if len(hrefs) > 1:
        hrefs = [href for href in hrefs if not RE_EXTRA_FILES.search(href)]
    if not hrefs:
        raise Exception("Couldn't find link for {!r}".format(title))
    if len(hrefs) > 1:
        raise Exception("Found multiple links for {!r}".format(title))
    report_url = urljoin(page_url, hrefs[0])
    report_id, _ = os.path.splitext(os.path.basename(report_url))

    date_text = cells[0].text.strip()
    published_on = None
    for date_format in DATE_FORMATS:
        try:
            published_on = datetime.datetime.strptime(date_text, date_format)
        except ValueError:
            continue
        break
    if published_on is None:
        admin.log_no_date("fcc", report_id, title, report_url)
        return None

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return None

    return {
        'inspector': 'fcc',
        'inspector_url': 'https://www.fcc.gov/inspector-general',
        'agency': 'fcc',
        'agency_name': "Federal Communications Commission",
        'type': 'audit',
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
Example #4
0
def peer_review_from(result, year_range):
  """Build the report dict for one National Archives OIG peer review link.

  The report ID is the linked file's name without its extension, and the
  publication date comes only from REPORT_PUBLISHED_MAP.

  Returns the report dict, or None when the ID has no entry in
  REPORT_PUBLISHED_MAP (logged via admin.log_no_date) or when the year is
  outside `year_range`.
  """
  report_url = urljoin(PEER_REVIEWS_URL, result.get('href'))
  report_id = os.path.splitext(report_url.split("/")[-1])[0]

  if report_id not in REPORT_PUBLISHED_MAP:
    admin.log_no_date("archives", report_id, result.text, report_url)
    return None
  published_on = REPORT_PUBLISHED_MAP[report_id]

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return None

  return {
    'inspector': 'archives',
    'inspector_url': 'https://www.archives.gov/oig/',
    'agency': 'archives',
    'agency_name': 'National Archives and Records Administration',
    'report_id': report_id,
    'url': report_url,
    'title': "Peer Review {}".format(published_on.year),
    'type': 'peer_review',
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
def peer_review_from(result, year_range):
    """Build the report dict for one National Archives OIG peer review link.

    The report ID is the linked file's name without its extension, and the
    publication date comes only from REPORT_PUBLISHED_MAP.

    Returns the report dict, or None when the ID has no entry in
    REPORT_PUBLISHED_MAP (logged via admin.log_no_date) or when the year is
    outside `year_range`.
    """
    report_url = urljoin(PEER_REVIEWS_URL, result['href'])
    report_id = os.path.splitext(report_url.split("/")[-1])[0]

    if report_id not in REPORT_PUBLISHED_MAP:
        admin.log_no_date("archives", report_id, result.text, report_url)
        return None
    published_on = REPORT_PUBLISHED_MAP[report_id]

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return None

    return {
        'inspector': 'archives',
        'inspector_url': 'https://www.archives.gov/oig/',
        'agency': 'archives',
        'agency_name': 'National Archives and Records Administration',
        'report_id': report_id,
        'url': report_url,
        'title': "Peer Review {}".format(published_on.year),
        'type': 'peer_review',
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
Example #6
0
def report_from(result, landing_url, topic, year_range, last_published_on):
  """Build the report dict for one SEC OIG listing entry.

  Returns a `(report, published_on)` pair:
    * `(None, None)` for the known link-less 'Jan. 7, 1997' entry, or when
      published_date_for_report() yields no date (logged via
      admin.log_no_date);
    * `(None, published_on)` for the two hard-coded duplicates and for
      reports whose year is outside `year_range`;
    * `(report, published_on)` otherwise.

  Raises IndexError when the entry has no link and is not the known
  exception, and KeyError when `topic` is not in TOPIC_TO_REPORT_TYPE.
  """
  try:
    report_link = result.select("a[href]")[0]
  except IndexError:
    # There is a bug for this date where it does not have a report.
    # https://www.sec.gov/about/offices/oig/inspector_general_audits_reports.shtml
    if result.text.strip() == 'Jan. 7, 1997':
      return None, None
    raise

  # HTTPS, even if they haven't updated their links yet
  report_url = re.sub(
    "^http://www.sec.gov",
    "https://www.sec.gov",
    urljoin(BASE_REPORT_URL, report_link['href']),
  )

  logging.debug("### Processing report %s" % report_url)
  filename_stem = os.path.splitext(report_url.split("/")[-1])[0]
  report_id = filename_stem.replace("%20", "-")
  title = report_link.text.strip()
  report_type = TOPIC_TO_REPORT_TYPE[topic]

  # The first non-blank line of the entry carries the date; anything from
  # "through" onwards is dropped, as are periods.
  nonblank_lines = [
    stripped
    for stripped in (line.strip() for line in result.text.split("\n"))
    if stripped
  ]
  published_on_text = nonblank_lines[0].split("through")[0].strip().replace(".", "")
  published_on = published_date_for_report(
    published_on_text, title, report_url, last_published_on, report_id)
  if not published_on:
    admin.log_no_date("sec", report_id, title, report_url)
    return None, None

  # Skip duplicate report
  published_ymd = (published_on.year, published_on.month, published_on.day)
  if report_id == '283fin' and published_ymd == (1999, 3, 16):
    return None, published_on

  # Audit Memo No. 39 is posted in two locations,
  # https://www.sec.gov/about/offices/oig/reports/audits/2005/am39.pdf and
  # https://www.sec.gov/about/oig/audit/am39.pdf,
  # skip the second one
  if report_url == 'https://www.sec.gov/about/oig/audit/am39.pdf':
    return None, published_on

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % landing_url)
    return None, published_on

  logging.debug("### Processing report %s" % report_link)

  report = {
    'report_id': report_id,
    'type': report_type,
    'topic': topic,
    'url': report_url,
    'landing_url': landing_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  add_common_fields(report)
  return report, published_on
Example #7
0
def report_from(result, page_url, year_range):
  """Turn one FCC OIG table row into a report dict.

  Rows with fewer than two <td> cells, rows whose second cell has
  align="Center" (column headers) and rows with no text are skipped.

  Returns the report dict, or None when the row is skipped, when the date
  in the first cell matches none of DATE_FORMATS (logged via
  admin.log_no_date), or when the year is outside `year_range`.

  Raises Exception when the row has no link, or more than one link left
  after filtering with RE_EXTRA_FILES.
  """
  cells = result.find_all("td")
  if len(cells) < 2:
    # 0 cells: degenerate row; 1 cell: title row with colspan="3"
    return None
  if cells[1]["align"] == "Center":
    # Column headers
    return None
  if not result.text.strip():
    # Empty spacer row
    return None

  title_paragraph = cells[1].p
  if title_paragraph is None:
    raw_title = cells[1].text
  else:
    raw_title = title_paragraph.contents[0]
  title = re.sub("\\s+", " ", raw_title).strip()

  hrefs = [anchor["href"] for anchor in result.find_all("a")]
  if len(hrefs) > 1:
    hrefs = [href for href in hrefs if not RE_EXTRA_FILES.search(href)]
  if not hrefs:
    raise Exception("Couldn't find link for {!r}".format(title))
  if len(hrefs) > 1:
    raise Exception("Found multiple links for {!r}".format(title))
  report_url = urljoin(page_url, hrefs[0])
  report_id, _ = os.path.splitext(os.path.basename(report_url))

  date_text = cells[0].text.strip()
  published_on = None
  for date_format in DATE_FORMATS:
    try:
      published_on = datetime.datetime.strptime(date_text, date_format)
    except ValueError:
      continue
    break
  if published_on is None:
    admin.log_no_date("fcc", report_id, title, report_url)
    return None

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return None

  return {
    'inspector': 'fcc',
    'inspector_url': 'https://www.fcc.gov/inspector-general',
    'agency': 'fcc',
    'agency_name': "Federal Communications Commission",
    'type': 'audit',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
Example #8
0
def report_from(result, landing_url, report_type, year_range):
  """Build the report dict for one National Endowment for the Arts OIG row.

  The publication date is looked for, in order, in the row's second <td>
  (as m/d/yy or m/d/yyyy), at the end of the link text (as
  "Month d, yyyy" after the last hyphen / en dash), and finally in
  REPORT_PUBLISHED_MAP.

  Returns the report dict, or None when the row has no link, when no date
  is found (logged via admin.log_no_date), or when the year is outside
  `year_range`. Reports listed in MISSING_IDS are flagged unreleased and
  missing, and have their URL cleared.
  """
  link = result.find("a")
  if not link:
    return None

  title = link.text
  report_url = urljoin(landing_url, link.get('href'))
  report_id = os.path.splitext(report_url.split("/")[-1])[0]

  published_on = None
  # Both formats are always attempted; a later success replaces an earlier one.
  for cell_format in ('%m/%d/%y', '%m/%d/%Y'):
    try:
      cell_text = result.select("td")[1].text.strip()
      published_on = datetime.datetime.strptime(cell_text, cell_format)
    except (ValueError, IndexError):
      pass

  if not published_on:
    try:
      title_tail = title.split("-")[-1].split("–")[-1].strip()
      published_on = datetime.datetime.strptime(title_tail, '%B %d, %Y')
    except ValueError:
      pass

  if not published_on and report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    admin.log_no_date("nea", report_id, title, report_url)
    return None

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return None

  report = {
    'inspector': 'nea',
    'inspector_url': 'http://arts.gov/oig',
    'agency': 'nea',
    'agency_name': 'National Endowment for the Arts',
    'type': report_type,
    'landing_url': landing_url,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if report_id in MISSING_IDS:
    report.update({'unreleased': True, 'missing': True, 'url': None})
  return report
def report_from(result, year_range):
    """Build the report dict for one Peace Corps OIG listing entry.

    The report type is "semiannual_report" when the title says so;
    otherwise it is the REPORT_TYPE_MAP value of the first recognised
    `.ul--tags li` tag. The date comes from REPORT_PUBLISHED_MAPPING or,
    failing that, from the text after the last "-" in the title.

    Returns the report dict, or None when no date is found (logged via
    admin.log_no_date), when the report is a repeat of an ID tracked in
    `doubled_reports`, or when the year is outside `year_range`.

    Raises Exception when no report type can be determined.
    """
    link = result.find("a")
    report_url = urllib.parse.unquote(link.get('href'))
    filename_stem = os.path.splitext(report_url.split("/")[-1])[0]
    report_id = urllib.parse.unquote(filename_stem)
    title = link.text

    report_type = None
    tag_text = None
    if "Semiannual Report to Congress" in title:
        report_type = "semiannual_report"
    else:
        for tag in result.select(".ul--tags li"):
            tag_text = tag.text.strip()
            if tag_text in REPORT_TYPE_MAP:
                report_type = REPORT_TYPE_MAP[tag_text]
                break
    if not report_type:
        raise Exception("Unrecognized report type %s" % tag_text)

    published_on = None
    if report_id in REPORT_PUBLISHED_MAPPING:
        published_on = REPORT_PUBLISHED_MAPPING[report_id]
    if not published_on:
        title_tail = title.split("-")[-1].strip().replace("Sept.", "September")
        try:
            published_on = datetime.datetime.strptime(title_tail, '%B %d, %Y')
        except ValueError:
            pass
    if not published_on:
        admin.log_no_date("peacecorps", report_id, title, report_url)
        return None

    # IDs tracked in doubled_reports are emitted once; the counter records
    # that the first occurrence has been seen.
    if report_id in doubled_reports:
        if doubled_reports[report_id] != 0:
            return None
        doubled_reports[report_id] += 1

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return None

    return {
        'inspector': 'peacecorps',
        'inspector_url':
        'https://www.peacecorps.gov/about/inspectors-general/',
        'agency': 'peacecorps',
        'agency_name': 'Peace Corps',
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
Example #10
0
def report_from(result, year_range, report_type=None):
  """Build the report dict for one CFTC OIG link.

  `result` is either the link element itself (its `name` is 'a') or an
  element containing links, in which case the last link is used.

  The publication date comes from REPORT_PUBLISHED_MAPPING, else from a
  "Month d, yyyy" date found in the link text, else from one found in the
  text of the node following the link.

  The report type is `report_type` if given, else whatever
  extract_report_type() returns for the title, else for the text of the
  preceding <p>, else "other".

  Returns the report dict, or None when no date is found (logged via
  admin.log_no_date) or when the year is outside `year_range`.
  """
  if result.name == 'a':
    link = result
  else:
    link = result.select("a")[-1]

  href = link['href']
  # Drop the local file-share prefix that appears in some hrefs.
  href = href.replace("file://///cftc.gov/home/dc/MWOODLAND/Desktop/", "")
  report_url = urljoin(REPORTS_URL, href)
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  title = link.text

  published_on = None
  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]
  if not published_on:
    # Try the title first, then the text right after the link.
    for candidate_text in (title, str(link.next_sibling)):
      # Raw string: "\w" / "\d" are invalid escapes in a plain literal.
      date_match = re.search(r"(\w+) (\d+), (\d+)", candidate_text)
      if not date_match:
        continue
      try:
        published_on = datetime.datetime.strptime(
          "/".join(date_match.groups()), '%B/%d/%Y')
      except ValueError:
        # The pattern also matches text such as "Report 12, 2015" whose
        # first word is not a month; treat that as "no date here" rather
        # than aborting.
        continue
      break
  if not published_on:
    admin.log_no_date("cftc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  if not report_type:
    report_type = extract_report_type(title)
  if not report_type:
    report_type = extract_report_type(result.find_previous("p").text)
  if not report_type:
    report_type = "other"

  report = {
    'inspector': 'cftc',
    'inspector_url': 'http://www.cftc.gov/About/OfficeoftheInspectorGeneral/index.htm',
    'agency': 'cftc',
    'agency_name': 'Commodity Futures Trading Commission',
    'file_type': 'pdf',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': report_type,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Example #11
0
def report_from(result, landing_url, report_type, year_range):
  """Build the report dict for one Appalachian Regional Commission OIG link.

  The title is the text of an <em> in the link's parent, else the parent's
  first child (its `.text` if it has one, otherwise the child itself).

  The publication date comes from REPORT_PUBLISHED_MAP, else from a
  "Month d, yyyy" date after the last en dash in the title, else from the
  Last-Modified header of a HEAD request for the report URL.

  Returns the report dict, or None when no date is found (logged via
  admin.log_no_date) or when the year is outside `year_range`.
  """
  report_url = urljoin(landing_url, result.get('href'))
  report_url = report_url.replace("../", "")
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)
  try:
    title = result.parent.find("em").text
  except AttributeError:
    try:
      title = result.parent.contents[0].text
    except AttributeError:
      title = result.parent.contents[0]

  # There's a typo in the link for this report, it points to the wrong file
  if report_id == "Report14-28-TN-17163" and title.find("Report on the Better Basics, Inc., Literacy Program for Clay, Jefferson") != -1:
    report_url = "http://www.arc.gov/images/aboutarc/members/IG/Report14-34-AL-17208-302-12.pdf"
    report_id = "Report14-34-AL-17208-302-12"

  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    try:
      published_on_text = title.split("\u2013")[-1].strip()
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except ValueError:
      pass

  if not published_on:
    try:
      response = utils.scraper.request(method="HEAD", url=report_url)
      last_modified = response.headers["Last-Modified"]
      published_on = datetime.datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
    except (KeyError, ValueError):
      # KeyError: the response has no Last-Modified header at all.
      # ValueError: the header is present but not in the expected format.
      # Either way this fallback yields no date and the report is logged
      # below instead of aborting.
      pass

  if not published_on:
    admin.log_no_date("arc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'arc',
    'inspector_url': 'http://www.arc.gov/oig',
    'agency': 'arc',
    'agency_name': 'Appalachian Regional Commission',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': report_type,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Example #12
0
def report_from(result, year_range):
  """Build the report dict for one Peace Corps OIG listing entry.

  The report type is "semiannual_report" when the title says so; otherwise
  it is the REPORT_TYPE_MAP value of the first recognised `.ul--tags li`
  tag. The date comes from REPORT_PUBLISHED_MAPPING or, failing that, from
  the text after the last "-" in the title.

  Returns the report dict, or None when no date is found (logged via
  admin.log_no_date), when the report is a repeat of an ID tracked in
  `doubled_reports`, or when the year is outside `year_range`.

  Raises Exception when no report type can be determined.
  """
  link = result.find("a")
  report_url = urllib.parse.unquote(link.get('href'))
  filename_stem = os.path.splitext(report_url.split("/")[-1])[0]
  report_id = urllib.parse.unquote(filename_stem)
  title = link.text

  report_type = None
  tag_text = None
  if "Semiannual Report to Congress" in title:
    report_type = "semiannual_report"
  else:
    for tag in result.select(".ul--tags li"):
      tag_text = tag.text.strip()
      if tag_text in REPORT_TYPE_MAP:
        report_type = REPORT_TYPE_MAP[tag_text]
        break
  if not report_type:
    raise Exception("Unrecognized report type %s" % tag_text)

  published_on = None
  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]
  if not published_on:
    title_tail = title.split("-")[-1].strip().replace("Sept.", "September")
    try:
      published_on = datetime.datetime.strptime(title_tail, '%B %d, %Y')
    except ValueError:
      pass
  if not published_on:
    admin.log_no_date("peacecorps", report_id, title, report_url)
    return None

  # IDs tracked in doubled_reports are emitted once; the counter records
  # that the first occurrence has been seen.
  if report_id in doubled_reports:
    if doubled_reports[report_id] != 0:
      return None
    doubled_reports[report_id] += 1

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return None

  return {
    'inspector': 'peacecorps',
    'inspector_url': 'https://www.peacecorps.gov/about/inspectors-general/',
    'agency': 'peacecorps',
    'agency_name': 'Peace Corps',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
Example #13
0
def report_from(result, landing_url, report_type, year_range):
    """Build the report dict for one Corporation for Public Broadcasting link.

    The report ID is the file name from the URL path with a trailing ".pdf"
    removed. The publication date comes from REPORT_PUBLISHED_MAP, else from
    an "Issued" <strong> three levels up from `result`, else from
    extract_date_from_report_id().

    Returns the report dict, or None for "Report in Brief" /
    "... Determination Letter" links, when no date is found (logged via
    admin.log_no_date), or when the year is outside `year_range`.
    """
    link = result.find('a')
    report_url = urljoin(landing_url, link['href'])
    report_filename = os.path.basename(urlparse(report_url)[2])
    # Remove only an exact ".pdf" suffix. str.rstrip('.pdf') strips any run
    # of the characters '.', 'p', 'd', 'f', so e.g. "fy15-rpt.pdf" would not
    # be affected but "fy15-draft.pdf" would lose its trailing "f" as well.
    # NOTE(review): if REPORT_PUBLISHED_MAP keys or previously stored IDs
    # were generated with the over-stripping behaviour, they need updating.
    if report_filename.endswith('.pdf'):
        report_id = report_filename[:-len('.pdf')]
    else:
        report_id = report_filename

    title = re.sub("\\s+", " ", link.text).strip()
    if 'semiannual' in report_id:
        title = "Semi-Annual Report: %s" % title

    if title == "Report in Brief" or title.endswith("Determination Letter"):
        # Skip report in brief or determination letter after a full report
        return

    published_on = None
    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]

    if not published_on:
        issued_strong = result.parent.parent.parent.find("strong",
                                                         text="Issued")
        if issued_strong:
            issued_on = ISSUED_DATE_EXTRACTION.search(
                issued_strong.parent.text)
            if issued_on:
                date_fmt = "%B %d, %Y"
                published_on = datetime.datetime.strptime(
                    issued_on.group(0), date_fmt)

    if not published_on:
        published_on = extract_date_from_report_id(report_id)

    if not published_on:
        admin.log_no_date("cpb", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'cpb',
        'inspector_url': 'http://www.cpb.org/oig/',
        'agency': 'cpb',
        'agency_name': 'Corporation for Public Broadcasting',
        'file_type': 'pdf',
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
        'unreleased': False,
    }

    return report
Example #14
0
def report_from(result, report_type, base_url, year_range):
    """Build the report dict for one National Labor Relations Board OIG entry.

    The link text is split on its first whitespace into the report ID (with
    trailing ':' and ',' removed) and the title. The publication date comes
    from REPORT_PUBLISHED_MAP or, when the ID is not listed there, from the
    first parenthesised piece of the title that parses as "Month d, yyyy"
    or "Month yyyy".

    Returns the report dict, or None for the archive preamble entry, for the
    link to AUDIT_REPORTS_ARCHIVE_URL, when no date is found (logged via
    admin.log_no_date), or when the year is outside `year_range`.
    """
    link = result.find("a")
    if not link and result.text.strip() == ARCHIVE_PREAMBLE_TEXT:
        return

    report_url = urllib.parse.urljoin(base_url, link.get('href'))
    report_id, title = link.text.split(maxsplit=1)
    report_id = report_id.rstrip(":").rstrip(",")

    if report_url == AUDIT_REPORTS_ARCHIVE_URL:
        return

    # Two documents share this ID; the management letter gets a suffix.
    if report_id == "OIG-F-21-17-01" and "Management Letter" in title:
        report_id += "-Management-Letter"

    title = title.strip()

    published_on = None
    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]
    else:
        # Raw string: "\(" and "\)" are invalid escapes in a plain literal.
        for paren_text in re.findall(r'\((.*?)\)', title):
            for date_format in ('%B %d, %Y', '%B %Y'):
                try:
                    published_on = datetime.datetime.strptime(
                        paren_text, date_format)
                    break
                except ValueError:
                    pass
            if published_on:
                break
    if not published_on:
        admin.log_no_date("nlrb", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'nlrb',
        'inspector_url': "https://www.nlrb.gov/who-we-are/inspector-general",
        'agency': 'nlrb',
        'agency_name': "National Labor Relations Board",
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    return report
def quarterly_report_from(result, year_range):
  """Build the report dict for one SIGTARP quarterly report link.

  The year is read from the "groupheader" <div> found four levels above
  the link. The publication date comes from QUARTERLY_REPORT_DATES or,
  failing that, from parsing the file name against a few known patterns.

  Returns the report dict, or None when the year is outside `year_range`
  or no date can be determined (logged via admin.log_no_date).
  """
  report_url = result['href']
  report_id, _ = os.path.splitext(report_url.split("/")[-1])

  container = result.parent.parent.parent.parent
  year = int(container.find("div", class_="groupheader").text.strip())
  if year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return None

  title = "Quarterly Report to Congress, {}, {}".format(year, result.text.strip())

  published_on = None
  if report_id in QUARTERLY_REPORT_DATES:
    published_on = QUARTERLY_REPORT_DATES[report_id]

  if published_on is None:
    filename_formats = (
      "%B_%d_%Y_Report_to_Congress",
      "%B_%d_%Y_Report_To_Congress",
      "%B_%d_%Y_Quarterly_Report_to_Congress",
    )
    for filename_format in filename_formats:
      try:
        published_on = datetime.datetime.strptime(report_id, filename_format)
      except ValueError:
        continue
      break

  if published_on is None:
    admin.log_no_date("sigtarp", report_id, title, report_url)
    return None

  return {
    'inspector': 'sigtarp',
    'inspector_url': "https://www.sigtarp.gov",
    'agency': 'sigtarp',
    'agency_name': "Special Inspector General for the Troubled Asset Relief Program",
    'type': 'quarterly',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
Example #16
0
def report_from(result, year_range):
    """Build the report dict for one Consumer Product Safety Commission entry.

    The link is resolved against REPORTS_URL and its query string and
    fragment are dropped. The publication date comes from
    REPORT_PUBLISHED_MAP or, when the ID is not listed there, from the
    entry's first `.date-display-single` element.

    Returns the report dict, or None for blacklisted URLs, when no date is
    available (logged via admin.log_no_date), or when the year is outside
    `year_range`.
    """
    link = result.find("a")
    absolute_url = urljoin(REPORTS_URL, link.get('href'))
    # Keep scheme, netloc, path and params; blank out query and fragment.
    report_url = urlunparse(list(urlparse(absolute_url)[:4]) + ["", ""])
    if report_url in BLACKLIST_REPORT_URLS:
        return None

    # Follow redirects to get real file names
    if report_url.startswith("https://www.cpsc.gov/Media/"):
        report_url = utils.resolve_redirect(report_url)

    # URLs with /PageFiles in them need to use the filename and its
    # directory to be unique. Other URLs can just use the filename.
    url_segments = report_url.split("/")
    if "PageFiles" in report_url:
        # e.g. /../132643/fy11fisma.pdf -> 132643-fy11fisma.pdf
        report_filename = "-".join(url_segments[-2:])
    else:
        report_filename = url_segments[-1]

    report_id = os.path.splitext(report_filename)[0]

    title = link.text
    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]
    else:
        date_spans = result.select(".date-display-single")
        if not date_spans:
            admin.log_no_date("cpsc", report_id, title, report_url)
            return None
        published_on = datetime.datetime.strptime(date_spans[0].text,
                                                  '%A, %B %d, %Y')

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return None

    return {
        'inspector': 'cpsc',
        'inspector_url': 'https://www.cpsc.gov/About-CPSC/Inspector-General/',
        'agency': 'cpsc',
        'agency_name': 'Consumer Product Safety Commission',
        'type': report_type_from_title(title),
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
Example #17
0
def report_from(result, landing_url, report_type, year_range):
    """Build the report dict for one Federal Maritime Commission OIG entry.

    The report ID is the text of the entry's first <td>, else of its first
    <li>, else the linked file's name without extension. The publication
    date comes from REPORT_PUBLISHED_MAP, else from a "Month d, yyyy" date
    after the last "-" in the link text.

    Returns the report dict, or None for blacklisted URLs, when no date is
    found (logged via admin.log_no_date), or when the year is outside
    `year_range`.
    """
    link = result.find("a")
    title = link.text
    report_url = urljoin(landing_url, link.get('href'))

    # Some listed URLs are replaced by a corrected one before any checks.
    if report_url in REPORT_URL_MAPPING:
        report_url = REPORT_URL_MAPPING[report_url]

    if report_url in BLACKLIST_REPORT_URLS:
        return None

    id_elements = result.select("td") or result.select("li")
    if id_elements:
        report_id = id_elements[0].text
    else:
        report_id = os.path.splitext(report_url.split("/")[-1])[0]

    published_on = None
    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]
    if not published_on:
        title_tail = title.split("-")[-1].strip()
        try:
            published_on = datetime.datetime.strptime(title_tail, '%B %d, %Y')
        except ValueError:
            pass

    if not published_on:
        admin.log_no_date("fmc", report_id, title, report_url)
        return None

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return None

    return {
        'inspector': 'fmc',
        'inspector_url':
        'http://www.fmc.gov/bureaus_offices/office_of_inspector_general.aspx',
        'agency': 'fmc',
        'agency_name': 'Federal Maritime Commission',
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
Example #18
0
def report_from(result, report_type, base_url, year_range):
  """Build the report dict for one National Labor Relations Board OIG entry.

  The link text is split on its first whitespace into the report ID (with
  trailing ':' and ',' removed) and the title. The publication date comes
  from REPORT_PUBLISHED_MAP or, when the ID is not listed there, from the
  first parenthesised piece of the title that parses as "Month d, yyyy" or
  "Month yyyy".

  Returns the report dict, or None for the archive preamble entry, for the
  link to AUDIT_REPORTS_ARCHIVE_URL, when no date is found (logged via
  admin.log_no_date), or when the year is outside `year_range`.
  """
  link = result.find("a")
  if not link and result.text.strip() == ARCHIVE_PREAMBLE_TEXT:
    return

  report_url = urllib.parse.urljoin(base_url, link.get('href'))
  report_id, title = link.text.split(maxsplit=1)
  report_id = report_id.rstrip(":").rstrip(",")

  if report_url == AUDIT_REPORTS_ARCHIVE_URL:
    return

  # Two documents share this ID; the management letter gets a suffix.
  if report_id == "OIG-F-21-17-01" and "Management Letter" in title:
    report_id += "-Management-Letter"

  title = title.strip()

  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  else:
    # Raw string: "\(" and "\)" are invalid escapes in a plain literal.
    for paren_text in re.findall(r'\((.*?)\)', title):
      for date_format in ('%B %d, %Y', '%B %Y'):
        try:
          published_on = datetime.datetime.strptime(paren_text, date_format)
          break
        except ValueError:
          pass
      if published_on:
        break
  if not published_on:
    admin.log_no_date("nlrb", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'nlrb',
    'inspector_url': "https://www.nlrb.gov/who-we-are/inspector-general",
    'agency': 'nlrb',
    'agency_name': "National Labor Relations Board",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Example #19
0
def report_from(result, landing_url, report_type, year_range):
  """Build the report dict for one Corporation for Public Broadcasting link.

  The report ID is the file name from the URL path with a trailing ".pdf"
  removed. The publication date comes from REPORT_PUBLISHED_MAP, else from
  an "Issued" <strong> three levels up from `result`, else from
  extract_date_from_report_id().

  Returns the report dict, or None for "Report in Brief" /
  "... Determination Letter" links, when no date is found (logged via
  admin.log_no_date), or when the year is outside `year_range`.
  """
  link = result.find('a')
  report_url = urljoin(landing_url, link['href'])
  report_filename = os.path.basename(urlparse(report_url)[2])
  # Remove only an exact ".pdf" suffix. str.rstrip('.pdf') strips any run of
  # the characters '.', 'p', 'd', 'f', so e.g. "fy15-rpt.pdf" would not be
  # affected but "fy15-draft.pdf" would lose its trailing "f" as well.
  # NOTE(review): if REPORT_PUBLISHED_MAP keys or previously stored IDs were
  # generated with the over-stripping behaviour, they need updating.
  if report_filename.endswith('.pdf'):
    report_id = report_filename[:-len('.pdf')]
  else:
    report_id = report_filename

  title = re.sub("\\s+", " ", link.text).strip()
  if 'semiannual' in report_id:
    title = "Semi-Annual Report: %s" % title

  if title == "Report in Brief" or title.endswith("Determination Letter"):
    # Skip report in brief or determination letter after a full report
    return

  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    issued_strong = result.parent.parent.parent.find("strong", text="Issued")
    if issued_strong:
      issued_on = ISSUED_DATE_EXTRACTION.search(issued_strong.parent.text)
      if issued_on:
        date_fmt = "%B %d, %Y"
        published_on = datetime.datetime.strptime(issued_on.group(0), date_fmt)

  if not published_on:
    published_on = extract_date_from_report_id(report_id)

  if not published_on:
    admin.log_no_date("cpb", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'cpb',
    'inspector_url': 'http://www.cpb.org/oig/',
    'agency': 'cpb',
    'agency_name': 'Corporation for Public Broadcasting',
    'file_type': 'pdf',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    'unreleased': False,
  }

  return report
Example #20
0
def report_from(result, year_range):
  """Turn one CPSC listing entry into a report dict.

  The report id is derived from the file name of the (possibly
  redirect-resolved) URL; the date comes from REPORT_PUBLISHED_MAP or from the
  entry's ".date-display-single" element.

  Returns None when the URL is blacklisted, no date is available, or the
  publication year is not in year_range.
  """
  anchor = result.find("a")
  absolute_url = urljoin(REPORTS_URL, anchor.get('href'))

  # Keep the first four URL components and blank out the remaining two
  # (query string and fragment).
  url_parts = list(urlparse(absolute_url)[:4])
  url_parts.extend(["", ""])
  report_url = urlunparse(url_parts)
  if report_url in BLACKLIST_REPORT_URLS:
    return None

  # Follow redirects to get real file names
  if report_url.startswith("https://www.cpsc.gov/Media/"):
    report_url = utils.resolve_redirect(report_url)

  # A bare file name is not unique for /PageFiles URLs, so those also use the
  # parent directory, e.g. /../132643/fy11fisma.pdf -> 132643-fy11fisma.pdf
  path_segments = report_url.split("/")
  if "PageFiles" in report_url:
    report_filename = "-".join(path_segments[-2:])
  else:
    report_filename = path_segments[-1]
  report_id = os.path.splitext(report_filename)[0]

  title = anchor.text
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  else:
    date_spans = result.select(".date-display-single")
    if not date_spans:
      admin.log_no_date("cpsc", report_id, title, report_url)
      return None
    published_on = datetime.datetime.strptime(date_spans[0].text,
                                              '%A, %B %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return None

  return {
    'inspector': 'cpsc',
    'inspector_url': 'https://www.cpsc.gov/About-CPSC/Inspector-General/',
    'agency': 'cpsc',
    'agency_name': 'Consumer Product Safety Commission',
    'type': report_type_from_title(title),
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
Example #21
0
def report_from(result, landing_url, report_type, year_range):
  """Turn one FMC listing entry into a report dict.

  The report id is the text of the entry's first <td>, else its first <li>,
  else the file name of the report URL without its extension. The date comes
  from REPORT_PUBLISHED_MAP or from the text after the last "-" in the title.

  Returns None for blacklisted URLs, undated entries and entries whose year
  is not in year_range.
  """
  anchor = result.find("a")
  report_url = urljoin(landing_url, anchor.get('href'))
  title = anchor.text

  if report_url in REPORT_URL_MAPPING:
    report_url = REPORT_URL_MAPPING[report_url]

  if report_url in BLACKLIST_REPORT_URLS:
    return None

  cells = result.select("td")
  if cells:
    report_id = cells[0].text
  else:
    items = result.select("li")
    if items:
      report_id = items[0].text
    else:
      report_id = os.path.splitext(report_url.split("/")[-1])[0]

  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  if not published_on:
    # Titles are expected to end in "... - Month DD, YYYY".
    trailing_text = title.rsplit("-", 1)[-1].strip()
    try:
      published_on = datetime.datetime.strptime(trailing_text, '%B %d, %Y')
    except ValueError:
      published_on = None

  if not published_on:
    admin.log_no_date("fmc", report_id, title, report_url)
    return None

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return None

  return {
    'inspector': 'fmc',
    'inspector_url': 'http://www.fmc.gov/bureaus_offices/office_of_inspector_general.aspx',
    'agency': 'fmc',
    'agency_name': 'Federal Maritime Commission',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
Example #22
0
def report_from(result, landing_url, report_type, year_range):
  """Build the report dict for one EAC report link.

  Args:
    result: link element exposing get('href') and text.
    landing_url: page the link was found on; the href is resolved against it.
    report_type: value stored under the report's 'type' key.
    year_range: container of years to keep.

  Returns:
    The report dict, or None when the link has no title, no publication date
    can be determined, or the publication year is not in year_range.
  """
  report_url = urljoin(landing_url, result.get('href'))
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)
  # Normalize the id: decode %-escapes, then turn whitespace runs and
  # underscores into hyphens.
  report_id = unquote(report_id)
  report_id = "-".join(report_id.split())
  report_id = report_id.replace("_", "-")

  title = clean_text(result.text)
  if not title:
    return

  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  if not published_on:
    # First try "Month DD, YYYY", using the last such match in the title.
    try:
      published_on_text = "-".join(re.findall(r'(\w+) (\d+), (\d{4})', title)[-1])
      published_on = datetime.datetime.strptime(published_on_text, '%B-%d-%Y')
    except (IndexError, ValueError):
      # IndexError: nothing matched. ValueError: the match is not a real date
      # (\w+ accepts any word, not just month names), so fall through to the
      # next format instead of crashing.
      pass
  if not published_on:
    # Then try "DD Month YYYY".
    try:
      published_on_text = "-".join(re.search(r'(\d+) (\w+) (\d{4})', title).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%d-%B-%Y')
    except (AttributeError, ValueError):
      pass
  if not published_on:
    admin.log_no_date("eac", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'eac',
    'inspector_url': 'http://www.eac.gov/inspector_general/',
    'agency': 'eac',
    'agency_name': 'Election Assistance Commission',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Example #23
0
def semiannual_report_from(result, year_range):
    """Build the report dict for one VA OIG semiannual report entry.

    The publication date is read from the end of the title, which may be set
    off by a hyphen, the word "to", or an en dash.

    Returns None when no date can be parsed or the year is not in year_range.
    """
    report_url = result.select("a")[0].get('href')
    report_id = os.path.splitext(report_url.split("/")[-1])[0]
    summary = result.select("p")[0].text
    title = result.select("h2 > a")[0].text

    # Try each separator in turn and stop at the first one whose trailing
    # text parses as "Month DD, YYYY".
    published_on = None
    for separator in ("-", " to ", "\u2013"):
        trailing_text = title.split(separator)[-1].strip()
        try:
            published_on = datetime.datetime.strptime(trailing_text,
                                                      '%B %d, %Y')
        except ValueError:
            continue
        break

    if published_on is None:
        admin.log_no_date("va", report_id, title, report_url)
        return None

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_id)
        return None

    return {
        'inspector': 'va',
        'inspector_url': 'https://www.va.gov/oig',
        'agency': 'VA',
        'agency_name': "Department of Veterans Affairs",
        'type': 'semiannual_report',
        'report_id': report_id,
        'url': report_url,
        'topic': "Semiannual Report",
        'summary': summary,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
Example #24
0
def report_from(result, landing_url, report_type, year_range):
    """Build the report dict for one EAC report link.

    Args:
        result: link element exposing get("href") and text.
        landing_url: page the link was found on; the href is resolved
            against it.
        report_type: value stored under the report's "type" key.
        year_range: container of years to keep.

    Returns:
        The report dict, or None when no publication date can be determined
        or the publication year is not in year_range.
    """
    report_url = urljoin(landing_url, result.get("href"))
    report_filename = report_url.split("/")[-1]
    report_id, _ = os.path.splitext(report_filename)
    # Normalize the id: decode %-escapes, then hyphenate whitespace runs.
    report_id = unquote(report_id)
    report_id = "-".join(report_id.split())

    title = clean_text(result.text)

    published_on = None
    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]
    if not published_on:
        # First try "Month DD, YYYY", using the last such match in the title.
        try:
            published_on_text = "-".join(re.findall(r"(\w+) (\d+), (\d{4})", title)[-1])
            published_on = datetime.datetime.strptime(published_on_text, "%B-%d-%Y")
        except (IndexError, ValueError):
            # IndexError: nothing matched. ValueError: the match is not a
            # real date (\w+ accepts any word, not just month names), so
            # fall through to the next format instead of crashing.
            pass
    if not published_on:
        # Then try "DD Month YYYY".
        try:
            published_on_text = "-".join(re.search(r"(\d+) (\w+) (\d{4})", title).groups())
            published_on = datetime.datetime.strptime(published_on_text, "%d-%B-%Y")
        except (AttributeError, ValueError):
            pass
    if not published_on:
        admin.log_no_date("eac", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        "inspector": "eac",
        "inspector_url": "http://www.eac.gov/inspector_general/",
        "agency": "eac",
        "agency_name": "Election Assistance Commission",
        "type": report_type,
        "report_id": report_id,
        "url": report_url,
        "title": title,
        "published_on": datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    return report
Example #25
0
def semiannual_report_from(result, year_range):
  """Build the report dict for one VA OIG semiannual report entry.

  The publication date is taken from the tail of the title, after a hyphen,
  the word "to", or an en dash, whichever parses first in that order.

  Returns None when no date can be parsed or the year is not in year_range.
  """
  report_url = result.select("a")[0].get('href')
  report_id = os.path.splitext(report_url.split("/")[-1])[0]
  summary = result.select("p")[0].text
  title = result.select("h2 > a")[0].text

  date_format = '%B %d, %Y'
  published_on = None
  for separator in ("-", " to ", "\u2013"):
    tail = title.split(separator)[-1].strip()
    try:
      published_on = datetime.datetime.strptime(tail, date_format)
    except ValueError:
      continue
    break

  if published_on is None:
    admin.log_no_date("va", report_id, title, report_url)
    return None

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_id)
    return None

  return {
    'inspector': 'va',
    'inspector_url': 'https://www.va.gov/oig',
    'agency': 'VA',
    'agency_name': "Department of Veterans Affairs",
    'type': 'semiannual_report',
    'report_id': report_id,
    'url': report_url,
    'topic': "Semiannual Report",
    'summary': summary,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
Example #26
0
def other_report_from(result, year_range):
    """Build the report dict for one entry of the NCUA "other reports" list.

    OTHER_REPORT_RE splits the link text into the title (group 1) and the
    date text (group 2); the date is accepted in either "Month DD, YYYY" or
    "Mon. DD, YYYY" form.

    Returns None when the date cannot be parsed or the year is not in
    year_range.
    """
    anchor = result.find("a")
    href = anchor["href"]
    stem = os.path.splitext(os.path.basename(href))[0]
    report_id = clean_text(stem).replace("'", "").replace(":", "")
    # Collapse runs of hyphens into a single one.
    report_id = re.sub("-+", "-", report_id)
    report_url = urljoin(OTHER_REPORTS_URL, href)

    match = OTHER_REPORT_RE.match(clean_text(anchor.text))
    title, published_on_text = match.group(1), match.group(2)

    published_on = None
    for date_format in ("%B %d, %Y", "%b. %d, %Y"):
        try:
            published_on = datetime.datetime.strptime(published_on_text,
                                                      date_format)
        except ValueError:
            continue
        break

    if not published_on:
        admin.log_no_date("ncua", report_id, title, report_url)
        return None

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return None

    return {
        'inspector': "ncua",
        'inspector_url': HOMEPAGE_URL,
        'agency': "ncua",
        'agency_name': "National Credit Union Administration",
        'type': "other",
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
Example #27
0
def other_report_from(result, year_range):
    """Build the report dict for one entry of the NCUA "other reports" list.

    The link text is matched against OTHER_REPORT_RE, whose first group is
    used as the title and whose second group is parsed as the date (full or
    abbreviated month name).

    Returns None when the date cannot be parsed or the year is not in
    year_range.
    """
    anchor = result.find("a")
    href = anchor["href"]
    stem, _ = os.path.splitext(os.path.basename(href))
    report_id = clean_text(stem).replace("'", "").replace(":", "")
    # Collapse runs of hyphens into a single one.
    report_id = re.sub("-+", "-", report_id)
    report_url = urljoin(OTHER_REPORTS_URL, href)

    match = OTHER_REPORT_RE.match(clean_text(anchor.text))
    title = match.group(1)
    date_text = match.group(2)

    published_on = None
    for date_format in ("%B %d, %Y", "%b. %d, %Y"):
        try:
            published_on = datetime.datetime.strptime(date_text, date_format)
        except ValueError:
            continue
        break

    if not published_on:
        admin.log_no_date("ncua", report_id, title, report_url)
        return None

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return None

    return {
        "inspector": "ncua",
        "inspector_url": HOMEPAGE_URL,
        "agency": "ncua",
        "agency_name": "National Credit Union Administration",
        "type": "other",
        "report_id": report_id,
        "url": report_url,
        "title": title,
        "published_on": datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
Example #28
0
def report_from(result, year_range, report_type):
  """Build the report dict for one EEOC report by fetching its HTML page.

  The HTML page supplies the title, the fiscal year and exactly one file
  link; the publication date must be present in REPORT_PUBLISHED_MAP.

  Returns None when the report has no mapped date or its fiscal year is not
  in year_range.

  Raises:
    Exception: if the HTML page yields zero or several file links.
  """
  path = result.get("href")
  html_report_url = urljoin(INSPECTOR_URL, path)
  html_report = utils.beautifulsoup_from_url(html_report_url)
  report_id = path.split('/')[-1]
  title = html_report.find("span", {"property": "dc:title"})['content']
  fiscal_year = fiscal_year_parse(html_report)

  hrefs = filter_links(html_report.select(".file a"))
  link_count = len(hrefs)
  if link_count > 1:
    message = "Found multiple links on {}:\n{}".format(html_report_url, hrefs)
    raise Exception(message)
  if link_count == 0:
    raise Exception("Found no links on {}".format(html_report_url))
  pdf_report_url = hrefs[0]

  if report_id not in REPORT_PUBLISHED_MAP:
    admin.log_no_date("eeoc", report_id, title, pdf_report_url)
    return None
  published_on = REPORT_PUBLISHED_MAP[report_id]

  # Filtering uses the fiscal year from the page, not the published date.
  if fiscal_year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % pdf_report_url)
    return None

  return {
    'inspector': "eeoc",
    'inspector_url': INSPECTOR_URL,
    'agency': "eeoc",
    'agency_name': "Equal Employment Opportunity Commission",
    'report_id': report_id,
    'url': pdf_report_url,
    'title': title,
    'type': report_type,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
Example #29
0
def report_from(result, year_range):
  """Build the report dict for one Denali Commission OIG report link.

  The report category is derived from the nearest preceding
  <span class="collapseomatic"> header. The date comes from
  REPORT_PUBLISHED_MAPPING (keyed by title, then by report id) or from a
  "YYYY.MM" prefix matched by DATE_RE.

  Returns None for skipped entries, undated entries, and entries whose year
  is not in year_range.

  Raises:
    Exception: if no header precedes the link, or a DATE_RE match leaves no
      usable title for a non-semiannual report.
  """
  # walk backwards through the doc to find the header title
  header_span = next(
    (element for element in result.previous_elements
     if element
     and isinstance(element, Tag)
     and element.name == "span"
     and element.has_attr("class")
     and "collapseomatic" in element["class"]),
    None)
  if header_span is None:
    raise Exception("Couldn't find the header for %s" % result)
  header = header_span.text.strip().lower()

  category = "other"
  for prefix, name in (("inspection", "inspection"),
                       ("semiannual", "semiannual_report")):
    if header.startswith(prefix):
      category = name
      break

  href = result['href']
  report_id = os.path.splitext(os.path.basename(href))[0]
  report_url = urljoin(REPORTS_URL, href.strip())
  title = inspector.sanitize(result.text)

  # Each financial/performance report is linked twice, once for the IG's
  # transmittal letter and independent auditor's report, and once for
  # the IG's "Perspective on Management and Performance Challenges."
  # Skip the first one and save the second
  is_transmittal_part = (
    "IG's Transmittal Letter and Independent Auditor's Report" in title
    and "(pages" in title)
  if is_transmittal_part or title == "Hotline Poster":
    return None

  published_on = (REPORT_PUBLISHED_MAPPING.get(title)
                  or REPORT_PUBLISHED_MAPPING.get(report_id))

  if not published_on:
    date_match = DATE_RE.match(title)
    if date_match:
      published_on = datetime.datetime.strptime(date_match.group(1), "%Y.%m")
      if date_match.lastindex == 2:
        # The second group holds the title that follows the date.
        title = date_match.group(2)
      elif header.startswith("semiannual"):
        title = published_on.strftime("Semiannual Report to Congress, %B %Y")
      else:
        raise Exception("No good title for %s" % report_id)

  if not published_on:
    admin.log_no_date("denali", report_id, title, report_url)
    return None

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return None

  return {
    'inspector': "denali",
    'inspector_url': "http://www.oig.denali.gov",
    'agency': "denali",
    'agency_name': "Denali Commission",
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': category,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
Example #30
0
def report_from(result, year_range, report_type, title_prefix=None):
    """Build the report dict for one FEC OIG listing entry.

    Links ending in ".pdf" take their title from the listing itself; any
    other link is fetched as a landing page, whose <h3> headings supply the
    title and the date. Remaining dates come from REPORT_PUBLISHED_MAPPING or
    a "Month YYYY" pattern in the title. title_prefix, when given, is
    prepended to the title.

    Returns None when no date is found or the year is not in year_range.
    """
    report_url = urljoin(REPORTS_URL, result.select("a")[-1].get("href"))

    # Temporary hacks to account for link mistakes
    link_corrections = {
        "http://www.fec.gov/fecig/documents/Semi14a_000.pdf":
            "http://www.fec.gov/fecig/documents/Semi14a.pdf",
        "http://www.fec.gov/fecig/documents/ReviewofOutstandingRecommendationsasofJune2014_001.pdf":
            "http://www.fec.gov/general/documents/ReviewofOutstandingRecommendationsasofJune2014.pdf",
    }
    report_url = link_corrections.get(report_url, report_url)

    report_id = os.path.splitext(report_url.split("/")[-1])[0]

    published_on = None
    if report_url.endswith(".pdf"):
        # Inline report
        raw_title = result.contents[0].strip().rstrip("-")
        title = re.sub("\\s+", " ", inspector.sanitize(raw_title))
        if title.endswith((" 200", " 201")):
            # some years are split up by a <span> tag
            title += result.contents[1].text
    else:
        # Some pages have separate landing pages.
        doc = utils.beautifulsoup_from_url(report_url)
        headings = doc.select("h3")
        title = headings[1].text.strip()
        # The date is in the third heading when there is one, otherwise in
        # the same heading as the title.
        date_heading = headings[2] if len(headings) > 2 else headings[1]
        date_text = date_heading.text.strip().replace("Period ending ", "")
        published_on = datetime.datetime.strptime(date_text, '%B %d, %Y')

    if title == "November 2016" and report_url == "http://www.fec.gov/fecig/documents/OIGSemiannualReporttoCongress-May2016-FinalPublicDistribution.pdf":
        # Fix copy-paste error
        report_url = "http://www.fec.gov/fecig/documents/OIGFall2016SARFINAL.pdf"
        report_id = os.path.splitext(report_url.split("/")[-1])[0]

    if not published_on and report_id in REPORT_PUBLISHED_MAPPING:
        published_on = REPORT_PUBLISHED_MAPPING[report_id]
    if not published_on:
        month_year = re.search('(\w+)\s+(\d{4})', title)
        if month_year is not None:
            try:
                published_on = datetime.datetime.strptime(
                    "-".join(month_year.groups()), '%B-%Y')
            except ValueError:
                pass

    if title_prefix:
        title = "{}{}".format(title_prefix, title)

    if not published_on:
        admin.log_no_date("fec", report_id, title, report_url)
        return None

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return None

    return {
        'inspector': "fec",
        'inspector_url': "http://www.fec.gov/fecig/fecig.shtml",
        'agency': "fec",
        'agency_name': "Federal Election Commission",
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        # Date of publication
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
Example #31
0
def report_from(result, landing_url, report_type, year_range):
    """Build the report dict for one row of a GPO OIG listing table.

    Args:
        result: table row exposing select(), td and find().
        landing_url: listing page URL; hrefs are resolved against it and it
            is recorded on unreleased reports.
        report_type: value stored under the report's 'type' key.
        year_range: container of years to keep.

    Returns:
        The report dict (with 'unreleased' and 'landing_url' added for
        unreleased reports), or None when no publication date is found or
        the year is not in year_range.
    """
    title = result.select("td")[-1].text
    title = re.sub("\\s+", " ", title)

    report_id_match = REPORT_ID_RE.match(result.td.text.strip())
    # Rows that are unreleased based on their title alone are never searched
    # for a link. Track that explicitly rather than relying on an
    # UnboundLocalError further down.
    link = None
    if ("contains sensitive information" in title
            or "This correspondence will not be posted" in title
            or title in UNPUBLISHED_REPORT_TITLES):
        unreleased = True
        report_url = None
        if report_id_match:
            report_id = report_id_match.group(0)
        else:
            report_id = inspector.slugify("-".join(
                title.strip().split())[:100])
    else:
        unreleased = False
        link = result.find("a")
        report_id = inspector.slugify(link.text.strip())
        if link.get('href') == "#":
            # A placeholder href means there is nothing to download.
            unreleased = True
            report_url = None
        else:
            report_url = urljoin(landing_url, link.get('href'))
            if (landing_url == SEMIANNUAL_REPORTS_URL
                    and "Transmittal Letter" in title):
                report_id = report_id + "-transmittal"

    # Date sources, in priority order: the link text, a date embedded in the
    # report URL, then the hard-coded map.
    published_on = None
    if link is not None:
        try:
            published_on = datetime.datetime.strptime(link.text.strip(),
                                                      '%m.%d.%y')
        except ValueError:
            pass

    if not published_on:
        if report_url:
            date_match = DATE_RE.search(report_url)
            if date_match:
                date_text = date_match.group(1)
                published_on = datetime.datetime.strptime(
                    date_text, "%m-%d-%y")

    if not published_on:
        if report_id in REPORT_PUBLISHED_MAP:
            published_on = REPORT_PUBLISHED_MAP[report_id]

    if not published_on:
        admin.log_no_date("gpo", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'gpo',
        'inspector_url': 'http://www.gpo.gov/oig/',
        'agency': 'gpo',
        'agency_name': 'Government Publishing Office',
        'file_type': 'pdf',
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if unreleased:
        report['unreleased'] = unreleased
        report['landing_url'] = landing_url
    return report
Example #32
0
def report_from(result, landing_url, topic, year_range, last_published_on):
    """Build the report dict for one SEC OIG listing entry.

    Returns a (report, published_on) pair. report is None for skipped
    entries; published_on is also None when the entry has no link or no
    date could be determined.

    Raises:
        IndexError: if the entry has no link and is not the known
            'Jan. 7, 1997' entry.
    """
    try:
        report_link = result.select("a[href]")[0]
    except IndexError as exc:
        # There is a bug for this date where it does not have a report.
        # https://www.sec.gov/about/offices/oig/inspector_general_audits_reports.shtml
        if result.text.strip() != 'Jan. 7, 1997':
            raise exc
        return None, None

    # HTTPS, even if they haven't updated their links yet
    report_url = re.sub("^http://www.sec.gov", "https://www.sec.gov",
                        urljoin(BASE_REPORT_URL, report_link['href']))

    logging.debug("### Processing report %s" % report_url)
    report_id = os.path.splitext(report_url.split("/")[-1])[0]
    report_id = report_id.replace("%20", "-")
    title = report_link.text.strip()
    report_type = TOPIC_TO_REPORT_TYPE[topic]

    # The date text is the first non-blank line of the entry, cut at
    # "through" and with periods removed.
    text_lines = [
        stripped
        for stripped in (line.strip() for line in result.text.split("\n"))
        if stripped
    ]
    first_line = text_lines[0]
    published_on_text = first_line.split("through")[0].strip().replace(".", "")
    published_on = published_date_for_report(published_on_text, title,
                                             report_url, last_published_on,
                                             report_id)
    if not published_on:
        admin.log_no_date("sec", report_id, title, report_url)
        return None, None

    # Skip duplicate report
    if report_id == '283fin' and (
            published_on.year, published_on.month,
            published_on.day) == (1999, 3, 16):
        return None, published_on

    # Audit Memo No. 39 is posted in two locations,
    # https://www.sec.gov/about/offices/oig/reports/audits/2005/am39.pdf and
    # https://www.sec.gov/about/oig/audit/am39.pdf,
    # skip the second one
    if report_url == 'https://www.sec.gov/about/oig/audit/am39.pdf':
        return None, published_on

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % landing_url)
        return None, published_on

    logging.debug("### Processing report %s" % report_link)

    report = {
        'report_id': report_id,
        'type': report_type,
        'topic': topic,
        'url': report_url,
        'landing_url': landing_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    add_common_fields(report)
    return report, published_on
def report_from(result, year_range):
  """Build the report dict for one row of the FDIC OIG reports table.

  The row's <em> text is the title, its first <td> the report type and its
  third <td> the publication date (MM/DD/YYYY) unless the report id is in
  REPORT_PUBLISHED_MAPPING. A row without any non-empty link is treated as
  unreleased.

  Returns None when the entry is deliberately skipped, its report type is
  blacklisted, its date cannot be parsed, or its year is not in year_range.
  Otherwise returns the report dict, with 'unreleased'/'landing_url' and
  'missing' added where applicable.
  """
  title = result.find("em").text.strip()
  landing_url = REPORTS_URL

  # Collect the non-empty hrefs in the row; the last one is the report.
  hrefs = [a.get("href").strip() for a in result.find_all("a")]
  hrefs = [href for href in hrefs if href]
  if hrefs:
    unreleased = False
    report_url = urljoin(REPORTS_URL, hrefs[-1])
  else:
    unreleased = True
    report_url = None

  if report_url == "https://www.fdicig.gov/semi-reports/sar2003mar/" \
          "oigsemi-03-09.pdf":
    # This URL is a typo, results in 404
    report_url = "https://www.fdicig.gov/semi-reports/Semi2003OCT/sarOCT03.shtml"

  if report_url == "https://www.fdicig.gov/semi-reports/sar2009mar/" \
          "oigsemi-03-09.pdf" and \
          title == "FDIC Office of Inspector General's Semiannual Report to " \
          "the Congress 4/1/2009 - 9/30/2009":
    # This URL points to the wrong report
    report_url = "https://www.fdicig.gov/semi-reports/SAROCT09/" \
        "OIGSemi_FDIC_09-9-09.pdf"

  if report_url == "https://www.fdicig.gov/press/pr-08-24-12.shtml" and \
          title == "Bank President Imprisoned for Embezzlement":
    # The title and URL don't match, and both were copied from other reports,
    # so we skip this entry
    return None

  report_type_text = result.select("td")[0].text
  if report_type_text in RECORD_TYPE_BLACKLIST:
    return
  report_type = type_for_report(report_type_text)

  # The report id is the URL's file name without extension when there is a
  # usable URL; otherwise it is built from the first 50 characters of the
  # hyphen-joined title.
  if report_url and report_url != GENERIC_MISSING_REPORT_URL:
    report_filename = report_url.split("/")[-1]
    report_id, extension = os.path.splitext(report_filename)
    if report_url.find("/evaluations/") != -1:
      # NOTE(review): this tests whether the URL (not report_id) ends in "e",
      # so any URL with a file extension gets the suffix -- confirm intent.
      if not report_url.endswith("e"):
        report_id = report_id + "e"
  else:
    report_id = "-".join(title.split())[:50]
    report_id = report_id.replace(":", "")

  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]
  else:
    published_on_text = result.select("td")[2].text
    try:
      published_on = datetime.datetime.strptime(published_on_text, '%m/%d/%Y')
    except ValueError:
      # Dump the offending row to stdout before logging the missing date.
      print(result)
      if report_url:
        admin.log_no_date("fdic", report_id, title, report_url)
      else:
        admin.log_no_date("fdic", report_id, title)
      return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  # A link to the generic "missing report" page is recorded as a missing,
  # unreleased report without a URL.
  missing = False
  if report_url == GENERIC_MISSING_REPORT_URL:
    missing = True
    unreleased = True
    report_url = None

  report = {
    'inspector': "fdic",
    'inspector_url': "https://www.fdicig.gov",
    'agency': "fdic",
    'agency_name': "Federal Deposit Insurance Corporation",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  if missing:
    report['missing'] = missing
  return report
def rss_report_from(result, year_range):
    """Build the report dict for one item of the Smithsonian OIG RSS feed.

    Known-broken links are replaced via RSS_BROKEN_LINKS; other URLs have
    four path prefixes rewritten. Each report id is returned at most once,
    tracked through the module-level report_ids_seen set.

    Returns None for announcement items, the retired strategic plan,
    already-seen ids, undated items and items whose year is not in
    year_range.
    """
    report_url = result.find("link").next_sibling.strip()
    if report_url.rstrip("/") == 'http://www.si.edu/oig':
        # This is the default url the IG uses for announcements of things like
        # a website redesign or changes to the RSS feed.
        return None

    if report_url == "http://www.si.edu/oig/OIGStratPlan.pdf":
        # This strategic plan is no longer on the website, but it is reproduced in
        # multiple semiannual reports, so we skip it here.
        return None

    if report_url in RSS_BROKEN_LINKS:
        report_url = RSS_BROKEN_LINKS[report_url]
    else:
        path_rewrites = (
            ("/OIG/SAR/Semiannual_Reports/", "/OIG/SAR/"),
            ("/oig/Semiannual_Reports/", "/Content/OIG/SAR/"),
            ("/oig/AuditReports/", "/Content/OIG/Audits/"),
            ("/oig/ARRA_Reports/", "/Content/OIG/Audits/"),
        )
        for old_prefix, new_prefix in path_rewrites:
            report_url = report_url.replace(old_prefix, new_prefix)

    # Only non-PDF links get an explicit file type.
    file_type = None if report_url.endswith(".pdf") else "html"

    report_id = os.path.splitext(report_url.split("/")[-1])[0]

    if report_id in report_ids_seen:
        return None
    report_ids_seen.add(report_id)

    title = result.find("title").text
    report_type = report_type_from_url(report_url)

    # The feed spells the month either abbreviated (%b) or in full (%B).
    published_on_text = result.find("pubdate").text
    published_on = None
    for date_format in ('%a, %d %b %Y %H:%M:%S %z',
                        '%a, %d %B %Y %H:%M:%S %z'):
        try:
            published_on = datetime.datetime.strptime(
                published_on_text, date_format).date()
        except ValueError:
            continue
        break

    if not published_on:
        admin.log_no_date("smithsonian", report_id, title, report_url)
        return None

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return None

    report = {
        'inspector': 'smithsonian',
        'inspector_url': 'https://www.si.edu/OIG',
        'agency': 'smithsonian',
        'agency_name': 'Smithsonian Institution',
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if file_type:
        report['file_type'] = file_type
    return report
Example #35
0
def report_from(result, year_range, topic, subtopic_url, subtopic=None):
    """Build the report dict for one HHS OIG listing entry.

    Args:
        result: listing element; either an <a> tag itself or an element
            containing one.
        year_range: container of years to keep.
        topic: topic name stored (stripped) on the report.
        subtopic_url: URL of the listing page; hrefs are resolved against it.
        subtopic: optional subtopic name stored on the report when truthy.

    Returns:
        The report dict, or None when the entry is a cross-reference, has no
        usable link, is blacklisted, points to another site, has no date, or
        falls outside year_range.
    """
    # Ignore links to other subsections
    if result.get('class') and result['class'][0] == 'crossref':
        return

    if result.name == 'a':
        # Sometimes we already have a link
        result_link = result
    else:
        result_link = result.find("a")

    # No link found, this is probably just an extra <li> on the page.
    if result_link is None:
        return

    # If this is just a anchor link on the same page, skip
    if not strip_url_fragment(result_link['href']):
        return

    title = result_link.text
    title = title.replace("\xe2\x80\x93", "-")
    title = inspector.sanitize(title)
    title = re.sub(r'\s+', ' ', title)
    if title in TITLE_NORMALIZATION:
        title = TITLE_NORMALIZATION[title]

    if title in BLACKLIST_TITLES:
        return

    report_url = urljoin(subtopic_url, result_link['href']).strip()

    if report_url in REPORT_URL_MAPPING:
        report_url = REPORT_URL_MAPPING[report_url]

    # Fix copy-paste error in link
    if (title == "Medicare Compliance Review of Altru Hospital for "
            "2012 and 2013" and report_url
            == "http://oig.hhs.gov/oas/reports/region4/41408036.asp"):
        report_url = "http://oig.hhs.gov/oas/reports/region7/71505070.asp"

    # Ignore reports from other sites
    if BASE_URL not in report_url:
        return

    if report_url in BLACKLIST_REPORT_URLS:
        return

    if report_url in OEI_COMBINED_LANDING_PAGES:
        report_url = OEI_COMBINED_LANDING_PAGES[report_url][title]

    report_filename = report_url.split("/")[-1]
    report_id, extension = os.path.splitext(report_filename)

    if report_filename == "11302505.pdf":
        report_id = report_id + "_early_alert"

    # Try a quick check from the listing page to see if we can bail out based on
    # the year
    try:
        published_on_text = result.find_previous("dt").text.strip()
        published_on = datetime.datetime.strptime(published_on_text,
                                                  "%m-%d-%Y")
    except (AttributeError, ValueError):
        published_on = None

    if published_on and published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    # This report is listed twice, once with the wrong date. Compare the
    # day-of-month via .day: .date is a method, so `.date == 12` was always
    # False and the duplicate was never skipped.
    if published_on and published_on.year == 2012 and published_on.month == 1 and \
            published_on.day == 12 and report_id == "20901002":
        return

    if report_id in REPORT_PUBLISHED_MAPPING:
        published_on = REPORT_PUBLISHED_MAPPING[report_id]
    else:
        # Process reports with landing pages
        if extension.lower() != '.pdf':
            report_url, published_on = report_from_landing_url(report_url)
        else:
            published_on = published_on_from_inline_link(
                result,
                report_filename,
                title,
                report_id,
                report_url,
            )

    if not published_on:
        admin.log_no_date("hhs", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'hhs',
        'inspector_url': 'http://oig.hhs.gov',
        'agency': 'hhs',
        'agency_name': 'Health & Human Services',
        'report_id': report_id,
        'topic': topic.strip(),
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if subtopic:
        report['subtopic'] = subtopic
    return report
Example #36
0
def report_from(result, landing_url, report_type, year_range):
  """Build the report dict for one FTC OIG listing entry.

  Args:
    result: Parsed listing element; its first <a> supplies the title and the
      link target (presumably a BeautifulSoup tag -- confirm with the caller).
    landing_url: URL of the listing page, used to resolve relative links.
    report_type: Value stored under the report's 'type' key.
    year_range: Collection of years to keep.

  Returns:
    The report dict, or None when no publication date can be determined or
    the date falls outside year_range.
  """
  link = result.find("a")

  report_url = urljoin(landing_url, link.get('href').strip())
  report_filename = report_url.split("/")[-1]
  report_id, extension = os.path.splitext(report_filename)

  title = link.text

  # Remember where the link points: report_url is cleared below for non-public
  # reports, but that page is still the one to consult for a <time> tag.
  page_url = report_url

  file_type = None
  unreleased = False
  if "Non Public Report" in title.replace("-", " "):  # Normalize title for easier detection
    unreleased = True
    landing_url = report_url
    report_url = None
  elif not report_url.endswith(".pdf"):
    # A link to an html report
    file_type = "html"

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on and not extension:
    # Extensionless links are pages rather than files; look for a <time> tag.
    # Fetch page_url, not report_url, because report_url is None for
    # non-public reports.
    report_doc = utils.beautifulsoup_from_url(page_url)
    if report_doc:
      time_tag = report_doc.time
      if time_tag:
        published_on = datetime.datetime.strptime(time_tag["datetime"], "%Y-%m-%d %H:%M:%S")

  if not published_on and landing_url == SEMIANNUAL_REPORTS_URL:
    # Semiannual reports only name a fiscal year and half, so the date is an
    # estimate (end of March or end of September).
    fy_match = re.match("Fiscal Year ([0-9]{4})", title)
    if fy_match:
      year = int(fy_match.group(1))
      if "(First Half)" in title:
        published_on = datetime.datetime(year, 3, 31)
        estimated_date = True
      elif "(Second Half)" in title:
        published_on = datetime.datetime(year, 9, 30)
        estimated_date = True

  if not published_on:
    admin.log_no_date("ftc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range.", report_url)
    return

  report = {
    'inspector': 'ftc',
    'inspector_url': "https://www.ftc.gov/about-ftc/office-inspector-general",
    'agency': 'ftc',
    'agency_name': "Federal Trade Commission",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  # Optional keys are only added when they carry information.
  if estimated_date:
    report['estimated_date'] = estimated_date
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  if file_type:
    report['file_type'] = file_type
  return report
Example #37
0
def report_from(result, year_range, topic, subtopic_url, subtopic=None):
  """Build the report dict for one HHS OIG listing entry.

  Args:
    result: Parsed listing element, either an <a> itself or an element
      containing one (presumably a BeautifulSoup tag -- confirm with caller).
    year_range: Collection of years to keep.
    topic: Topic name; stored stripped under 'topic'.
    subtopic_url: URL of the listing page, used to resolve relative links.
    subtopic: Optional subtopic name, stored under 'subtopic' when truthy.

  Returns:
    The report dict, or None when the entry is skipped (cross-reference,
    no link, same-page anchor, blacklisted, off-site, known duplicate,
    no date found, or date outside year_range).
  """
  # Ignore links to other subsections
  if result.get('class') and result['class'][0] == 'crossref':
    return

  if result.name == 'a':
    # Sometimes we already have a link
    result_link = result
  else:
    result_link = result.find("a")

  # No link found, this is probably just an extra <li> on the page.
  if result_link is None:
    return

  # If this is just a anchor link on the same page, skip
  if not strip_url_fragment(result_link['href']):
    return

  title = result_link.text
  title = title.replace("\xe2\x80\x93", "-")
  title = inspector.sanitize(title)
  title = re.sub(r'\s+', ' ', title)
  if title in TITLE_NORMALIZATION:
    title = TITLE_NORMALIZATION[title]

  if title in BLACKLIST_TITLES:
    return

  report_url = urljoin(subtopic_url, result_link['href']).strip()

  if report_url in REPORT_URL_MAPPING:
    report_url = REPORT_URL_MAPPING[report_url]

  # Fix copy-paste error in link
  if (title == "Medicare Compliance Review of Altru Hospital for "
          "2012 and 2013" and
          report_url == "http://oig.hhs.gov/oas/reports/region4/41408036.asp"):
    report_url = "http://oig.hhs.gov/oas/reports/region7/71505070.asp"

  # Ignore reports from other sites
  if BASE_URL not in report_url:
    return

  if report_url in BLACKLIST_REPORT_URLS:
    return

  if report_url in OEI_COMBINED_LANDING_PAGES:
    report_url = OEI_COMBINED_LANDING_PAGES[report_url][title]

  report_filename = report_url.split("/")[-1]
  report_id, extension = os.path.splitext(report_filename)

  if report_filename == "11302505.pdf":
    report_id = report_id + "_early_alert"

  # Try a quick check from the listing page to see if we can bail out based on
  # the year
  try:
    published_on_text = result.find_previous("dt").text.strip()
    published_on = datetime.datetime.strptime(published_on_text, "%m-%d-%Y")
  except (AttributeError, ValueError):
    published_on = None

  if published_on and published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range.", report_url)
    return

  # This report is listed twice, once with the wrong date.
  # Use .day here: .date is a method, so comparing it with 12 never matches.
  if (published_on and published_on.year == 2012 and
          published_on.month == 1 and published_on.day == 12 and
          report_id == "20901002"):
    return

  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]
  else:
    # Process reports with landing pages
    if extension.lower() != '.pdf':
      report_url, published_on = report_from_landing_url(report_url)
    else:
      published_on = published_on_from_inline_link(
        result,
        report_filename,
        title,
        report_id,
        report_url,
      )

  if not published_on:
    admin.log_no_date("hhs", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range.", report_url)
    return

  # Note: from here on `result` is the output dict, not the input element.
  result = {
    'inspector': 'hhs',
    'inspector_url': 'http://oig.hhs.gov',
    'agency': 'hhs',
    'agency_name': 'Health & Human Services',
    'report_id': report_id,
    'topic': topic.strip(),
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if subtopic:
    result['subtopic'] = subtopic
  return result
Example #38
0
def report_from(result, landing_url, report_type, year_range):
  """Turn one GPO OIG table row into a report dict.

  The title comes from the row's last cell. Rows whose title marks them as
  withheld, or whose link target is "#", are recorded as unreleased with no
  URL. Returns None when no date can be found or the date is outside
  year_range.
  """
  title = re.sub("\\s+", " ", result.select("td")[-1].text)
  id_match = REPORT_ID_RE.match(result.td.text.strip())

  # Defaults for rows that never yield a usable link.
  link = None
  report_url = None

  withheld = (
    "contains sensitive information" in title or
    "This correspondence will not be posted" in title or
    title == "Unscheduled and Unpaid Absenteeism in the Office of "
             "Plant Operations"
  )
  if withheld:
    unreleased = True
    if id_match:
      report_id = id_match.group(0)
    else:
      report_id = inspector.slugify("-".join(title.strip().split())[:100])
  else:
    link = result.find("a")
    report_id = inspector.slugify(link.text.strip())
    href = link.get('href')
    unreleased = href == "#"
    if not unreleased:
      report_url = urljoin(landing_url, href)
      if landing_url == SEMIANNUAL_REPORTS_URL and "Transmittal Letter" in title:
        report_id = report_id + "-transmittal"

  # First choice: the link text itself may be a date such as 01.31.14.
  published_on = None
  if link is not None:
    try:
      published_on = datetime.datetime.strptime(link.text.strip(), '%m.%d.%y')
    except ValueError:
      pass

  # Second choice: a date embedded in the report URL.
  if not published_on and report_url:
    url_date = DATE_RE.search(report_url)
    if url_date:
      published_on = datetime.datetime.strptime(url_date.group(1), "%m-%d-%y")

  # Last choice: the hard-coded lookup table.
  if not published_on and report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    admin.log_no_date("gpo", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'gpo',
    'inspector_url': 'http://www.gpo.gov/oig/',
    'agency': 'gpo',
    'agency_name': 'Government Publishing Office',
    'file_type': 'pdf',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  return report
Example #39
0
def report_from(result, landing_url, report_type, year_range):
  """Turn one USAID OIG listing entry into a report dict.

  Entries without a link are recorded as unreleased. The date is read from a
  dedicated date span, or failing that from a date prefix on the title (which
  is then removed from the title). Returns None when no date is found, the
  date is outside year_range, or the entry is a known duplicate upload.
  """
  anchor = result.find("a")
  unreleased = not anchor
  if unreleased:
    title = result.select("div.views-field-title")[0].text
    report_url = None
  else:
    title = anchor.text
    report_url = anchor.get('href')

  published_on = None
  date_spans = result.select("span.date-display-single")
  if date_spans:
    published_on_text = date_spans[0].text
    published_on = datetime.datetime.strptime(published_on_text, '%m/%d/%Y')

  if not published_on:
    # Fall back to a date at the start of the title span.
    title_spans = result.select("div.views-field-title span")
    if title_spans:
      title_text = title_spans[0].text.strip()
      leading_date = DATE_RE.match(title_text)
      if leading_date:
        published_on_text = leading_date.group(0)
        published_on = datetime.datetime.strptime(published_on_text, "%B %d, %Y")
        title = title_text[leading_date.end():]

  if not published_on:
    admin.log_no_date("usaid", report_url, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  # Report ID: the report-number field, else the file name, else title + date.
  report_id = None
  id_fields = result.select("div.views-field-field-auditreport-doc-1")
  if id_fields:
    raw_id = id_fields[0].text.strip()
    report_id = "-".join(raw_id.replace("/", "-").replace(":", "").split())

  if not report_id and report_url:
    report_id = os.path.splitext(report_url.split("/")[-1])[0]

  if not report_id:
    report_id = "{}-{}".format("-".join(title.split()), published_on_text)

  report_id = report_id.replace("/", "-")

  if title.startswith("Follow-Up"):
    report_id = report_id + "-follow-up"

  if report_url == "https://oig.usaid.gov/sites/default/files/audit-reports/" \
          "0-000-12-001-s_0.pdf":
    # Two versions of this report have been uploaded
    report_id = report_id + "_final"

  if report_url == "https://oig.usaid.gov/sites/default/files/audit-reports/" \
          "1-520-01-010-p_0.pdf":
    # This file has been uploaded twice, once with "_0" and once without
    return None

  if report_url in MISMATCHED_REPORT_URLS:
    # The report number and PDF file for these reports are copies of unrelated
    # reports
    report_id = "-".join(re.split("[^a-z]+", title.lower()))
    report_url = None
    unreleased = True

  report = {
    'inspector': "usaid",
    'inspector_url': "https://oig.usaid.gov",
    'agency': "usaid",
    'agency_name': "Agency For International Development",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
    del report['url']
  return report
Example #40
0
def report_from(result, year_range, report_type, title_prefix=None):
  """Build the report dict for one FEC OIG listing entry.

  Args:
    result: Parsed listing element; its last <a> is the report link
      (presumably a BeautifulSoup tag -- confirm with the caller).
    year_range: Collection of years to keep.
    report_type: Value stored under the report's 'type' key.
    title_prefix: Optional string prepended to the title.

  Returns:
    The report dict, or None when no publication date can be determined or
    the date falls outside year_range.
  """
  report_url = urljoin(REPORTS_URL, result.select("a")[-1].get("href"))

  # Temporary hacks to account for link mistakes
  if report_url == "http://www.fec.gov/fecig/documents/Semi14a_000.pdf":
    report_url = "http://www.fec.gov/fecig/documents/Semi14a.pdf"
  if report_url == "http://www.fec.gov/fecig/documents/ReviewofOutstanding" \
                   "RecommendationsasofJune2014_001.pdf":
    report_url = "http://www.fec.gov/general/documents/ReviewofOutstanding" \
                 "RecommendationsasofJune2014.pdf"

  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  published_on = None
  if report_url.endswith(".pdf"):
    # Inline report
    title = inspector.sanitize(result.contents[0].strip().rstrip("-"))
    title = re.sub(r"\s+", " ", title)
    if title.endswith((" 200", " 201")):
      # some years are split up by a <span> tag
      title = title + result.contents[1].text
  else:
    # Some pages have separate landing pages.
    doc = utils.beautifulsoup_from_url(report_url)
    headings = doc.select("h3")
    title = headings[1].text.strip()
    # The date is in the third <h3> when present, otherwise in the second.
    try:
      published_on_text = headings[2].text.strip()
    except IndexError:
      published_on_text = headings[1].text.strip()
    published_on_text = published_on_text.replace("Period ending ", "")
    published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if title == "November 2016" and report_url == "http://www.fec.gov/fecig/documents/OIGSemiannualReporttoCongress-May2016-FinalPublicDistribution.pdf":
    # Fix copy-paste error
    report_url = "http://www.fec.gov/fecig/documents/OIGFall2016SARFINAL.pdf"
    report_filename = report_url.split("/")[-1]
    report_id, _ = os.path.splitext(report_filename)

  if not published_on:
    if report_id in REPORT_PUBLISHED_MAPPING:
      published_on = REPORT_PUBLISHED_MAPPING[report_id]
  if not published_on:
    # Last resort: a "<Month> <year>" pair somewhere in the title.
    try:
      published_on_text = "-".join(re.search(r'(\w+)\s+(\d{4})', title).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%B-%Y')
    except (ValueError, AttributeError):
      pass

  if title_prefix:
    title = "{}{}".format(title_prefix, title)

  if not published_on:
    admin.log_no_date("fec", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range.", report_url)
    return

  report = {
    'inspector': "fec",
    'inspector_url': "http://www.fec.gov/fecig/fecig.shtml",
    'agency': "fec",
    'agency_name': "Federal Election Commission",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),  # Date of publication
  }
  return report
def report_from(result, year_range):
  """Build the report dict for one Smithsonian OIG link.

  Args:
    result: The link element; its 'href' and text are read (presumably a
      BeautifulSoup <a> tag -- confirm with the caller).
    year_range: Collection of years to keep.

  Returns:
    The report dict, or None when the URL is blacklisted, the report ID was
    already seen, the link has no text, no date can be determined, or the
    date falls outside year_range.
  """
  report_url = urljoin(RECENT_AUDITS_URL, result.get('href'))
  if report_url in URL_BLACKLIST:
    return None
  # Strip extra path adjustments
  report_url = report_url.replace("../", "")

  summary = None
  if not report_url.endswith(".pdf"):
    # Some reports link to other page which link to the full report
    report_page = utils.beautifulsoup_from_url(report_url)
    relative_report_url = report_page.select("div.block a[href]")[0]['href']
    report_url = urljoin(report_url, relative_report_url)
    # Strip extra path adjustments
    report_url = report_url.replace("../", "")

    summary = "\n".join(paragraph.text for paragraph in report_page.select("div.grid_12 p"))

  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  # Module-level set: each report ID is processed at most once.
  if report_id in report_ids_seen:
    return
  report_ids_seen.add(report_id)

  title = result.text.strip()
  report_type = report_type_from_url(report_url)

  if not title:
    return None

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    # AttributeError: no "<word> <day>, <year>" in the title.
    # ValueError: the pattern matched but the word is not a month name (or
    # the day is invalid); fall through to the month/year fallback instead
    # of aborting.
    try:
      published_on_text = "/".join(re.search(r'(\w+) (\d+), (\d+)', title).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%B/%d/%Y')
    except (AttributeError, ValueError):
      pass

  if not published_on:
    month_year_match = MONTH_YEAR_RE.search(result.text)
    if month_year_match:
      date_text = ' '.join(month_year_match.group(0).split())
      published_on = datetime.datetime.strptime(date_text, '%B %Y')
      # Only month and year are known, so the day is a guess.
      estimated_date = True

  if not published_on:
    admin.log_no_date("smithsonian", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range.", report_url)
    return

  report = {
    'inspector': 'smithsonian',
    'inspector_url': 'https://www.si.edu/OIG',
    'agency': 'smithsonian',
    'agency_name': 'Smithsonian Institution',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if summary:
    report['summary'] = summary
  if estimated_date:
    report['estimated_date'] = estimated_date
  return report
Example #42
0
def report_from(result, landing_url, year_range):
  """Build the report dict for one FCA OIG link.

  Args:
    result: The link element (presumably a BeautifulSoup <a> tag -- confirm
      with the caller).
    landing_url: URL of the listing page, used to resolve relative links.
    year_range: Collection of years to keep.

  Returns:
    The report dict, or None when the link is a same-page anchor, sits in a
    'disc' list, is the "Inspector General Reports" back link, has no
    determinable date, or is dated outside year_range.
  """
  report_url = urljoin(landing_url, result.get('href'))

  # HTTPS, even if they haven't updated their links yet
  report_url = re.sub("^http://www.fca.gov", "https://www.fca.gov", report_url)

  if landing_url + '#' in report_url:
    # These are just anchor links, skip them.
    return

  parent_list = result.find_parent("ul")
  if parent_list and parent_list.get('type') == 'disc':
    # These are just anchor links, skip them.
    return

  title = clean_text(result.text)

  if title == 'Inspector General Reports':
    # Just a return link to the main IG page
    return

  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  published_on = None
  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]
  if not published_on:
    # The date normally follows the link inside the same list item; when that
    # text is not available, fall back to the link text itself.
    try:
      li = result.parent
      if li.name == "u":
        li = li.parent
      published_on_text = li.contents[1].lstrip(",").split("(")[0].strip()
    except (IndexError, TypeError):
      published_on_text = result.text.strip()
    published_on_text = clean_text(published_on_text)

    try:
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except ValueError:
      pass

  if not published_on:
    # Same text, but with the whole date wrapped in parentheses.
    try:
      published_on_text = li.contents[1].strip().lstrip("(").rstrip(")")
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except (IndexError, TypeError, ValueError):
      pass

  if not published_on:
    # Last resort: a three-letter month prefix and a four-digit year.
    # ValueError covers text whose first three word characters are not a
    # month abbreviation, so such entries are logged below, not fatal.
    try:
      published_on_text = "/".join(re.search(r"(\w{3}).* (\d{4})", published_on_text).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%b/%Y')
    except (AttributeError, ValueError):
      pass

  if not published_on:
    admin.log_no_date("fca", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range.", report_url)
    return

  # The report type is taken from the nearest preceding section header.
  report_type_text = result.find_previous("p", class_="mainContentheader2").text.strip()
  report_type = type_for_report(report_type_text)

  report = {
    'inspector': 'fca',
    'inspector_url': 'https://www.fca.gov/home/inspector.html',
    'agency': 'fca',
    'agency_name': 'Farm Credit Administration',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Example #43
0
def report_from(result, landing_url, report_type, year_range):
    """Turn one RRB OIG table row into a report dict.

    The second cell holds the title and, when present, the report link. Rows
    without a link are recorded as unreleased and get an ID built from the
    date and title. Returns None when no date can be found or the date is
    outside year_range.
    """
    cell = result.select("td")[1]
    link = cell.a

    if link:
        report_url = urljoin(landing_url, link.get('href'))
        filename = IE_DOWNLOAD_SUFFIX_RE.sub("", report_url.split("/")[-1])
        report_id = os.path.splitext(filename)[0]
        title = re.sub("\\s+", " ", link.text.strip())
        unreleased = False
    else:
        report_url = None
        unreleased = True
        title = re.sub("\\s+", " ", cell.text.strip()).replace(
            " (Unavailable)", "")

    # Preferred source: an explicit date in the cell text.
    published_on = None
    cell_date = DATE_RE.search(cell.text)
    if cell_date:
        published_on = datetime.datetime.strptime(cell_date.group(1),
                                                  "%m/%d/%Y")

    if published_on is None:
        if link is not None:
            # Linked rows: semiannual-report month/year, else the ID table.
            sar_match = SAR_RE.search(link.text)
            if sar_match:
                published_on = datetime.datetime.strptime(
                    sar_match.group(1), "%B %Y")
            elif report_id in REPORT_ID_PUBLISHED_MAP:
                published_on = REPORT_ID_PUBLISHED_MAP[report_id]
        elif title in REPORT_TITLE_PUBLISHED_MAP:
            # Unlinked rows can only be dated through the title table.
            published_on = REPORT_TITLE_PUBLISHED_MAP[title]
        else:
            admin.log_no_date("rrb", "?", title)
            return

    if link is None:
        date_part = published_on.strftime("%m-%d-%y")
        title_part = "-".join(title.split())
        report_id = "{}-{}".format(date_part, title_part)[:50]

    if published_on is None:
        admin.log_no_date("rrb", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % title)
        return

    report = {
        'inspector': 'rrb',
        'inspector_url': "http://www.rrb.gov/oig/",
        'agency': 'rrb',
        'agency_name': "Railroad Retirement Board",
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if unreleased:
        report['unreleased'] = unreleased
        report['landing_url'] = landing_url
    return report
Example #44
0
def report_from(result, landing_url, report_type, year_range):
  """Build the report dict for one LSC OIG listing entry.

  Args:
    result: Parsed listing element, either an <a> itself or an element
      containing links (presumably a BeautifulSoup tag -- confirm with caller).
    landing_url: URL of the listing page, used to resolve relative links.
    report_type: Value stored under the report's 'type' key.
    year_range: Collection of years to keep.

  Returns:
    The report dict, or None when the entry is empty, blacklisted, a
    "See, also:" cross-reference, has no determinable date, or is dated
    outside year_range.
  """
  if not result.text or result.text in BLACKLIST_REPORT_TITLES:
    # There are a few empty links due to bad html and some links for alternative
    # formats (PDF) that we will just ignore.
    return

  link_text = None
  if result.name == 'a':
    report_url = result.get('href')
    link_text = inspector.sanitize(result.text)
    title = inspector.sanitize("%s %s" % (result.text, result.next_sibling))
  else:
    links = [link for link in result.find_all('a') if link.text.strip()]
    report_url = links[0].get('href')
    link_text = inspector.sanitize(result.a.text)
    title = inspector.sanitize(result.text)
  report_url = urljoin(landing_url, report_url)
  report_filename = os.path.basename(report_url)

  if title.endswith("PDF"):
    title = title[:-3]
  title = title.rstrip(" .")

  prev = result.previous_sibling
  if isinstance(prev, NavigableString) and "See, also:" in prev:
    return None

  report_no_match = REPORT_NO_RE.match(link_text)
  if report_no_match:
    report_id = report_no_match.group(0)
    # Prefix the report number according to where the file lives.
    if "fraud" in report_url.lower():
      report_id = "fraud-alert-" + report_id
    elif "Client_Trust_Fund" in report_url:
      report_id = "CTF-" + report_id
    elif report_filename.startswith("sr"):
      report_id = "special-report-" + report_id
  else:
    report_id, _ = os.path.splitext(report_filename)
    report_id = unquote(report_id)
  report_id = "-".join(report_id.split())
  report_id = report_id.replace("\\", "")  # strip backslashes

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  elif link_text == "June 2015":
    published_on = datetime.datetime(2015, 6, 1)
  else:
    # Look for a date in the title, most specific pattern first.
    published_on_text = None
    for pattern in (r'(\d+/\d+/\d+)', r'(\w+ \d+, \d+)', r'(\d+/\d+)'):
      text_match = re.search(pattern, title)
      if text_match:
        published_on_text = text_match.group(1)
        break

    if not published_on_text:
      admin.log_no_date("lsc", report_id, title, report_url)
      return

    datetime_formats = [
      '%B %d, %Y',
      '%m/%d/%Y',
      '%m/%d/%y',
      '%m/%Y',
      '%m/%y'
    ]
    for datetime_format in datetime_formats:
      try:
        published_on = datetime.datetime.strptime(published_on_text, datetime_format)
      except ValueError:
        pass
      else:
        break

  if not published_on:
    admin.log_no_date("lsc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range.", report_url)
    return

  report = {
    'inspector': 'lsc',
    'inspector_url': 'https://www.oig.lsc.gov',
    'agency': 'lsc',
    'agency_name': 'Legal Services Corporation',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }

  if estimated_date:
    report['estimated_date'] = estimated_date

  # Compare for equality: `in ("...")` without a trailing comma is a substring
  # test against the string, which also matches any prefix of this URL.
  if report_url == "https://www.oig.lsc.gov/core-legal-services":
    report['file_type'] = "html"

  if report_url.startswith("https://oig.lsc.gov/mapping/references/eval"):
    report['unreleased'] = True
    report['missing'] = True

  return report
Example #45
0
def run(options):
    """Scrape the EPA OIG index pages and save every report in range.

    Each index page is read either as a table (first <table> on the page) or,
    when the page has no table, as a list ("ul.field li"). Rows without a
    usable date are reported through admin.log_no_date and skipped.

    Args:
        options: Scraper options, passed to inspector.year_range.

    Raises:
        inspector.NoReportsFoundError: if no report row has been seen by the
            time an index page has been processed.
    """
    year_range = inspector.year_range(options, archive)

    report_seen_flag = False
    for url in years_to_index_urls(year_range):
        index = utils.beautifulsoup_from_url(url)
        tables = index.find_all("table")
        lis = index.select("ul.field li")
        if tables:
            table = tables[0]
            trs = table.select('tr')
            for tr in trs:
                tds = tr.select('td')
                if not tds:
                    continue
                if RE_YEAR.match(tds[0].text):
                    continue
                if "".join(td.text for td in tds).strip() == "":
                    continue
                report_seen_flag = True

                # Reset per row: otherwise a failed parse would silently reuse
                # the previous row's date (or raise on the very first row).
                published_on_dt = None
                # Best effort: the date is usually in the first column, and
                # sometimes in the third. parse_date is defined elsewhere, so
                # the broad except is kept as in the original.
                try:
                    published_on_dt = parse_date(tds[0].text.strip())
                except Exception:
                    pass
                if not published_on_dt:
                    try:
                        published_on_dt = parse_date(tds[2].text.strip())
                    except Exception:
                        pass
                if not published_on_dt:
                    admin.log_no_date("epa", tds[2].text, tds[1].text)
                    continue
                if published_on_dt.year not in year_range:
                    continue

                report = report_from_table(tds, published_on_dt, url)
                if report:
                    inspector.save_report(report)
        else:
            for li in lis:
                report_seen_flag = True

                # Reset per item for the same reason as above; href is only
                # resolved when the hard-coded date table has to be consulted.
                published_on_dt = None
                href = None
                date_match = RE_DATE.search(li.text)
                if date_match:
                    published_on_dt = parse_date(date_match.group())
                else:
                    href = urljoin(url, li.a["href"])
                    if href in REPORT_PUBLISHED_MAPPING:
                        published_on_dt = REPORT_PUBLISHED_MAPPING[href]
                if not published_on_dt:
                    admin.log_no_date("epa", extract_url(li), li.a.text, href)
                    continue
                if published_on_dt.year not in year_range:
                    continue

                report = report_from_list(li, published_on_dt, url)
                if report:
                    inspector.save_report(report)

        if not report_seen_flag:
            raise inspector.NoReportsFoundError("EPA")
Example #46
0
def report_from(result, landing_url, report_type, year_range):
    """Build the report dict for one ARC OIG link.

    Args:
        result: The link element (presumably a BeautifulSoup <a> tag --
            confirm with the caller).
        landing_url: URL of the listing page, used to resolve relative links.
        report_type: Value stored under the report's 'type' key.
        year_range: Collection of years to keep.

    Returns:
        The report dict, or None when no publication date can be determined
        or the date falls outside year_range.
    """
    report_url = urljoin(landing_url, result.get('href'))
    report_url = report_url.replace("../", "")
    report_filename = report_url.split("/")[-1]
    report_id, _ = os.path.splitext(report_filename)

    # Title: the parent's <em> text if present, else the parent's first child
    # (its .text when it has one, otherwise the child itself).
    try:
        title = result.parent.find("em").text
    except AttributeError:
        try:
            title = result.parent.contents[0].text
        except AttributeError:
            title = result.parent.contents[0]

    # There's a typo in the link for this report, it points to the wrong file
    if report_id == "Report14-28-TN-17163" and (
            "Report on the Better Basics, Inc., Literacy Program for Clay, Jefferson"
            in title):
        report_url = "http://www.arc.gov/images/aboutarc/members/IG/Report14-34-AL-17208-302-12.pdf"
        report_id = "Report14-34-AL-17208-302-12"

    published_on = None
    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]

    if not published_on:
        # The date normally follows an en dash at the end of the title.
        try:
            published_on_text = title.split("\u2013")[-1].strip()
            published_on = datetime.datetime.strptime(published_on_text,
                                                      '%B %d, %Y')
        except ValueError:
            pass

    if not published_on:
        # Fall back to the file's Last-Modified header. KeyError covers
        # responses that do not send the header at all, so those reports are
        # logged below instead of aborting the run.
        try:
            response = utils.scraper.request(method="HEAD", url=report_url)
            last_modified = response.headers["Last-Modified"]
            published_on = datetime.datetime.strptime(
                last_modified, "%a, %d %b %Y %H:%M:%S %Z")
        except (KeyError, ValueError):
            pass

    if not published_on:
        admin.log_no_date("arc", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range.", report_url)
        return

    report = {
        'inspector': 'arc',
        'inspector_url': 'http://www.arc.gov/oig',
        'agency': 'arc',
        'agency_name': 'Appalachian Regional Commission',
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'type': report_type,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    return report
def report_from(result, year_range):
    """Build the report dict for one Smithsonian OIG link.

    Args:
        result: The link element; its 'href' and text are read (presumably a
            BeautifulSoup <a> tag -- confirm with the caller).
        year_range: Collection of years to keep.

    Returns:
        The report dict, or None when the URL is blacklisted, the report ID
        was already seen, the link has no text, no date can be determined, or
        the date falls outside year_range.
    """
    report_url = urljoin(RECENT_AUDITS_URL, result.get('href'))
    if report_url in URL_BLACKLIST:
        return None
    # Strip extra path adjustments
    report_url = report_url.replace("../", "")

    summary = None
    if not report_url.endswith(".pdf"):
        # Some reports link to other page which link to the full report
        report_page = utils.beautifulsoup_from_url(report_url)
        relative_report_url = report_page.select(
            "div.block a[href]")[0]['href']
        report_url = urljoin(report_url, relative_report_url)
        # Strip extra path adjustments
        report_url = report_url.replace("../", "")

        summary = "\n".join(
            paragraph.text
            for paragraph in report_page.select("div.grid_12 p"))

    report_filename = report_url.split("/")[-1]
    report_id, _ = os.path.splitext(report_filename)

    # Module-level set: each report ID is processed at most once.
    if report_id in report_ids_seen:
        return
    report_ids_seen.add(report_id)

    title = result.text.strip()
    report_type = report_type_from_url(report_url)

    if not title:
        return None

    estimated_date = False
    published_on = None
    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]

    if not published_on:
        # AttributeError: no "<word> <day>, <year>" in the title.
        # ValueError: the pattern matched but the word is not a month name
        # (or the day is invalid); fall through to the month/year fallback
        # instead of aborting.
        try:
            published_on_text = "/".join(
                re.search(r'(\w+) (\d+), (\d+)', title).groups())
            published_on = datetime.datetime.strptime(published_on_text,
                                                      '%B/%d/%Y')
        except (AttributeError, ValueError):
            pass

    if not published_on:
        month_year_match = MONTH_YEAR_RE.search(result.text)
        if month_year_match:
            date_text = ' '.join(month_year_match.group(0).split())
            published_on = datetime.datetime.strptime(date_text, '%B %Y')
            # Only month and year are known, so the day is a guess.
            estimated_date = True

    if not published_on:
        admin.log_no_date("smithsonian", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range.", report_url)
        return

    report = {
        'inspector': 'smithsonian',
        'inspector_url': 'https://www.si.edu/OIG',
        'agency': 'smithsonian',
        'agency_name': 'Smithsonian Institution',
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if summary:
        report['summary'] = summary
    if estimated_date:
        report['estimated_date'] = estimated_date
    return report
Example #48
0
def process_report(result, year_range):
  """Use the report ID obtained from HTML to hit GAO's API.

  Title, description, publication date and PDF links are scraped from the
  HTML ``result``. The numeric ID found in a PDF filename (or, failing that,
  in the landing link) is then used to query GAO's JSON API for the document
  type, categories and related metadata.

  Args:
    result: parsed HTML node for a single search result; accessed through
      ``.a``, ``.span``, ``.p`` and ``find_all``.
    year_range: container of years; reports published in other years are
      skipped.

  Returns:
    A dict describing the report, or None when no publication date can be
    parsed, the year is out of range, or the API response is empty.
  """
  # <a href="/assets/690/685452.pdf">View Report (PDF, 8 pages)</a>
  # 685452 is the ID used by the API.

  # The link's path looks like "/products/GAO-17-558", use the last part
  # as the report ID
  landing_url = urljoin('https://www.gao.gov', result.a['href'])
  report_number = os.path.basename(result.a['href'])

  title = re.sub("\\s+", " ", result.span.text).strip()
  description = re.sub("\\s+", " ", result.p.text).strip()

  dates = result.find_all('span')[-1].string.replace('\n', '').split(': ')
  # ['Published', 'Mar 31, 1959. Publicly Released', 'Mar 31, 1959.']
  # Prefer the first, fall back to the latter if necessary--not sure it ever is
  published_on = parse_date(dates[1].split('.')[0].strip())
  if not published_on:
    published_on = parse_date(dates[-1].replace('.', '').strip())

  if not published_on:
    admin.log_no_date("gaoreports", report_number, title, landing_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % landing_url)
    return

  pdf_links = result.find_all('li', {'class': 'pdf-link'})
  (report_url, highlights_url, accessible_url) = (None, None, None)
  for link in pdf_links:
    if not link.a or link.a['href'] == '':
      continue
    # get_text() rather than .string: .string is None when the anchor holds
    # more than one child node, which would make the `in` tests raise.
    link_text = link.a.get_text()
    link_url = urljoin('https://www.gao.gov', link.a['href'])
    if 'View Report' in link_text:
      report_url = link_url
    if 'Highlights' in link_text:
      highlights_url = link_url
    if 'Accessible' in link_text:
      accessible_url = link_url

  # Last PDF is full report. First one could be Highlights.
  try:  # get the ID from one of the filenames, minus the extension
    api_id = os.path.splitext(os.path.basename(pdf_links[-1].a['href']))[0]
  except Exception:  # very old reports are sometimes different
    api_id = os.path.splitext(os.path.basename(result.a['href']))[0]
  api_id = api_id.lstrip('0')

  if not landing_url and not report_url:
    logging.debug("[%s] No landing URL or PDF, skipping..." % api_id)
    return None

  api_url = "http://www.gao.gov/api/id/%s" % api_id
  json_response = json.loads(utils.download(api_url))
  if not json_response:
    return None
  details = json_response[0]

  # `details` looks like this:
  # {
  #   "youtube_id": null,
  #   "type": "reports",
  #   "content_id": "685451",
  #   "bucket_term": "Defense Management",
  #   "title": "DOD Has Taken Initial Steps to Formulate",
  #   "description": null,
  #   "rptno": "GAO-17-523R",
  #   "docdate": "2017-06-23",
  #   "actual_release_date": "2017-06-23T12:00:00Z",
  #   "actual_release_date_formatted": "Jun 23, 2017",
  #   "original_release_dt": null,
  #   "category_img": "http://www.gao.gov/images/rip/defense.jpg",
  #   "category_img_alt": "defense icon",
  #   "additional_links": "",
  #   "topics": ["National Defense"],
  #   "subsite": ["Correspondence"],
  #   "format": null,
  #   "mime_type_s": null,
  #   "ereport_flag": 0,
  #   "pdf_url": "http://www.gao.gov/assets/690/685452.pdf",
  #   "url": "http://www.gao.gov/products/GAO-17-523R",
  #   "document_type": "report",
  #   "supplement_url": null,
  #   "description_short": ""
  # }

  if 'html_url' in details:
    accessible_url = details['html_url']
  categories = details.get('topics') or []  # json could have null or []
  if details.get('bucket_term'):
    categories.append(details['bucket_term'])
  # Title, dates and description deliberately come from the HTML above rather
  # than from the API's 'title', 'docdate', 'actual_release_date' and
  # 'description' fields.

  report = {
    'inspector': 'gaoreports',
    'inspector_url': 'https://www.gao.gov',
    # often GAO reports do focus on a program in a specific external agency,
    # but we're not attempting to discern it.
    # We'll just have GAO for the inspector and the agency.
    'agency': 'gao',
    'agency_name': 'Government Accountability Office',
    'report_id': report_number,
    'landing_url': landing_url,
    'url': report_url,
    'title': title,
    'type': details['document_type'],
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),

    'highlights_url': highlights_url,
    'accessible_url': accessible_url,
    'description': description,
    'categories': categories,
    'category_img': details['category_img'],
    'category_img_alt': details['category_img_alt'],
    'subsite': details['subsite']
  }

  # No "View Report" PDF was found, so only the landing page is available.
  if not report_url:
    report['unreleased'] = True

  return report
Example #49
0
def report_from(result, landing_url, report_type, year_range):
    """Build an NEA OIG report dict from one row of a report listing.

    The publication date is taken, in order of preference, from the row's
    second table cell (``MM/DD/YY`` or ``MM/DD/YYYY``), from a trailing
    "Month D, YYYY" after the last dash in the link text, or from
    REPORT_PUBLISHED_MAP.

    Args:
        result: parsed HTML node for the row.
        landing_url: URL of the listing page; relative links resolve
            against it and it is stored as the report's landing URL.
        report_type: value stored under 'type'.
        year_range: container of years; other years are skipped.

    Returns:
        The report dict, or None when the row has no link, no date can be
        determined, or the year is outside year_range.
    """
    anchor = result.find("a")
    if not anchor:
        return

    title = anchor.text
    report_url = urljoin(landing_url, anchor.get('href'))
    report_id = os.path.splitext(report_url.split("/")[-1])[0]

    published_on = None

    # Date cell: accept both two-digit and four-digit years.
    for cell_format in ('%m/%d/%y', '%m/%d/%Y'):
        try:
            cell_text = result.select("td")[1].text.strip()
            published_on = datetime.datetime.strptime(cell_text, cell_format)
        except (ValueError, IndexError):
            pass

    if not published_on:
        # Fall back to a date after the last hyphen / en dash of the title.
        trailing_text = title.split("-")[-1].split("–")[-1].strip()
        try:
            published_on = datetime.datetime.strptime(trailing_text,
                                                      '%B %d, %Y')
        except ValueError:
            pass

    if not published_on and report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]

    if not published_on:
        admin.log_no_date("nea", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'nea',
        'inspector_url': 'http://arts.gov/oig',
        'agency': 'nea',
        'agency_name': 'National Endowment for the Arts',
        'type': report_type,
        'landing_url': landing_url,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if report_id in MISSING_IDS:
        # Listed in MISSING_IDS: flag it and drop the file URL.
        report.update({'unreleased': True, 'missing': True, 'url': None})
    return report
def report_from(result, page_url, report_type, year_range):
  """Build a Treasury OIG report dict from a link (or a node containing one).

  The node text is expected to end with a date, either as
  "<title>, Month D, YYYY" or as "<title> Month D, YYYY". The first word of
  the title is treated as the report ID when it contains a hyphen; otherwise
  the ID is taken from the linked filename and the full title is kept.
  An entry in REPORT_PUBLISHED_MAP overrides any date parsed from the text.

  Args:
    result: parsed HTML node; either the ``<a>`` itself or a parent of it.
    page_url: URL of the page the node came from; relative links resolve
      against it.
    report_type: value stored under 'type'.
    year_range: container of years; other years are skipped.

  Returns:
    The report dict, or None when no date is available, the report is the
    known duplicate, or the year is outside year_range.
  """
  raw_text = result.text
  try:
    # "<title>, Month D, YYYY"
    title, month_day, year_text = raw_text.rsplit(",", 2)
    published_on = datetime.datetime.strptime(
      (month_day + year_text).strip(), '%B %d %Y')
  except ValueError:
    try:
      # "<title> Month D, YYYY" with no comma after the title
      title, month, day, year_text = raw_text.rsplit(maxsplit=3)
      published_on = datetime.datetime.strptime(
        (month + day + year_text).strip(), '%B%d,%Y')
    except ValueError:
      title = raw_text
      published_on = None

  original_title = clean_text(title)
  report_id, title = original_title.split(maxsplit=1)
  report_id = report_id.rstrip(":")

  link = result if result.name == "a" else result.a
  report_url = urljoin(page_url, link['href'])

  # HTTPS, even if they haven't updated their links yet
  report_url = re.sub("^http://www.treasury.gov", "https://www.treasury.gov", report_url)

  if '-' not in report_id:
    # If the first word of the text doesn't contain a hyphen,
    # then it's probably part of the title, and not a tracking number.
    # In this case, fall back to the URL.
    filename_stem, extension = os.path.splitext(report_url.split("/")[-1])
    report_id = unquote(filename_stem)

    # Reset the title, since we previously stripped off the first word
    # as a candidate report_id.
    title = original_title

  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    admin.log_no_date("treasury", report_id, title, report_url)
    return

  # Skip this report, it already shows up under other audit reports
  if report_id == "Role of Non-Career Officials in Treasury FOIA Processing":
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  return {
    'inspector': 'treasury',
    'inspector_url': 'https://www.treasury.gov/about/organizational-structure/ig/',
    'agency': 'treasury',
    'agency_name': "Department of the Treasury",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
def audit_report_from(result, landing_url, year, year_range):
    """Build a National Archives OIG audit report dict from one listing entry.

    The publication date is resolved, in order, from REPORT_PUBLISHED_MAP,
    from a "Month D, YYYY" date in the entry text, from the same shape with
    a stray space before the comma, and finally from the Last-Modified
    header of a HEAD request for the report URL.

    Args:
        result: parsed HTML node for one listing entry.
        landing_url: URL of the listing page; relative links resolve
            against it.
        year: not used by this function.
        year_range: container of years; other years are skipped.

    Returns:
        The report dict, or None when the entry is blank, has no link, has
        no resolvable date, or falls outside year_range.
    """
    if not result.text.strip():
        return
    link = result.find("a")
    if not link:
        # No anchor to take a URL from; skip instead of failing on
        # link['href'].
        return

    report_url = urljoin(landing_url, link['href'])
    report_filename = report_url.split("/")[-1]
    report_id, _ = os.path.splitext(report_filename)

    try:
        title = result.select("blockquote")[0].contents[0]
    except IndexError:
        title = result.text

    # Strip prefixes such as "Audit Report 15-01: " from the title.
    title_prefixer = re.compile(
        r"(Advisory|Management|Audit)\s*(Letter|Report)\s*[\d\-]+:\s*",
        re.I)
    title = title_prefixer.sub("", title)

    # NOTE(review): never set to True below, so 'estimated_date' is never
    # emitted -- even for dates taken from Last-Modified. Confirm intent.
    estimated_date = False
    published_on = None

    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]

    cleaned_text = re.sub(r"\s+", " ", inspector.sanitize(result.text))
    text_date_formats = (
        (r'(\w+ \d+, \d+)', '%B %d, %Y'),
        (r'(\w+ \d+ , \d+)', '%B %d , %Y'),
    )
    for pattern, date_format in text_date_formats:
        if published_on:
            break
        date_match = re.search(pattern, cleaned_text)
        if not date_match:
            continue
        try:
            published_on = datetime.datetime.strptime(date_match.group(1),
                                                      date_format)
        except ValueError:
            # Shaped like a date (e.g. "Report 12, 2015") but not one;
            # move on to the next fallback.
            pass

    if not published_on:
        try:
            response = utils.scraper.request(method="HEAD", url=report_url)
            last_modified = response.headers["Last-Modified"]
            published_on = datetime.datetime.strptime(
                last_modified, "%a, %d %b %Y %H:%M:%S %Z")
        except (KeyError, ValueError):
            # Header absent or not in the expected format.
            pass

    if not published_on:
        admin.log_no_date("archives", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'archives',
        'inspector_url': 'https://www.archives.gov/oig/',
        'agency': 'archives',
        'agency_name': 'National Archives and Records Administration',
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'type': 'audit',
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if estimated_date:
        report['estimated_date'] = estimated_date
    return report
def audit_report_from(result, page_url, year_range):
  """Build a Treasury OIG audit report dict from one row of the audit index.

  The row's direct children are read positionally: the first gives the
  agency, the second the publication date, the third the summary text from
  which the report ID and title are derived.

  Args:
    result: parsed HTML node for the table row.
    page_url: URL of the index page; used as the landing URL for reports
      that are not publicly released.
    year_range: container of years; other years are skipped.

  Returns:
    The report dict, or None for empty/header/extra rows, known duplicate
    listings, rows without a parseable date, and years outside year_range.

  Raises:
    Exception: when no report ID can be derived from the row, or when the
      report link resolves to AUDIT_REPORTS_BASE_URL itself.
  """
  if not clean_text(result.text):
    # Empty row
    return

  # Get all direct child nodes
  children = list(result.find_all(True, recursive=False))
  published_on_text = clean_text(children[1].text)

  # this is the header row
  if published_on_text.strip() == "Date":
    return None

  # '%m/%d%Y' covers dates typed without the second slash.
  date_formats = ['%m/%d/%Y', '%m/%d%Y', '%m/%d/%y']
  published_on = None
  for date_format in date_formats:
    try:
      published_on = datetime.datetime.strptime(published_on_text, date_format)
    except ValueError:
      pass

  report_summary = clean_text(children[2].text)
  if not report_summary:
    # There is an extra row that we want to skip
    return

  report_summary = report_summary.replace("OIG-15-38Administrative",
                                          "OIG-15-38 Administrative")
  summary_match = SUMMARY_RE.match(report_summary)
  summary_match_2 = SUMMARY_FALLBACK_RE.match(report_summary)
  if summary_match:
    report_id = summary_match.expand(r"\1-\2-\3")
    title = summary_match.group(4)
  elif summary_match_2:
    # NOTE(review): the template starts with a literal "(", which ends up in
    # the report_id. Looks like a typo, but changing it would alter existing
    # IDs -- confirm before touching.
    report_id = summary_match_2.expand(r"(\2-\1-\3")
    title = summary_match_2.group(4)
  elif report_summary.startswith("IGATI") and published_on is not None:
    # There are two such annual reports from different years, append the year
    report_id = "IGATI %d" % published_on.year
    title = report_summary
  elif report_summary == "Report on the Bureau of the Fiscal Service Federal " \
          "Investments Branch\u2019s Description of its Investment/" \
          "Redemption Services and the Suitability of the Design and Operating " \
          "Effectiveness of its Controls for the Period August 1, 2013 to " \
          "July 31, 2014":
    # This one is missing its ID in the index
    report_id = "OIG-14-049"
    title = report_summary
  elif report_summary == "Correspondence related to the resolution of audit recommendation 1 OIG-16-001 OFAC Libyan Sanctions Case Study (Please read this correspondence in conjunction with the report.)":
    # Need to make up a report_id for this supplemental document
    report_id = "OIG-16-001-resolution"
    title = report_summary
  else:
    try:
      filename_match = FILENAME_RE.match(os.path.basename(result.a["href"]))
      report_id = filename_match.group(1)
      title = report_summary
    except (ValueError, IndexError, AttributeError):
      raise Exception("Couldn't parse report ID: %s" % repr(report_summary))

  if report_id == 'OIG-15-015' and \
          'Financial Statements for hte Fiscal Years 2014 and 2013' in title:
    # This report is listed twice, once with a typo
    return

  if report_id == 'OIG-07-003' and published_on_text == '11/23/2006':
    # This report is listed twice, once with the wrong date
    return

  # There are copy-paste errors with several retracted reports
  # (published_on may still be None here; the no-date case is handled below.)
  if report_id == 'OIG-14-037' and published_on is not None:
    if published_on.year == 2011 or published_on.year == 2010:
      return
  if report_id == 'OIG-13-021' and published_on_text == '12/12/2012':
    return

  if published_on is None:
    admin.log_no_date("treasury", report_id, title)
    return

  agency_slug_text = children[0].text

  if report_id in REPORT_AGENCY_MAP:
    agency_slug = REPORT_AGENCY_MAP[report_id]
  else:
    # Use only the part of the agency cell before any "&".
    agency_slug = clean_text(agency_slug_text.split("&")[0]).lower()

  if (report_id in UNRELEASED_REPORTS or
          "If you would like a copy of this report" in report_summary or
          "If you would like to see a copy of this report" in report_summary or
          "have been removed from the OIG website" in report_summary or
          "removed the auditors\u2019 reports from the" in report_summary or
          "Classified Report" in report_summary or
          "Classified Audit Report" in report_summary or
          "Sensitive But Unclassified" in report_summary or
          "To obtain further information, please contact the OIG" in report_summary or
          "Report is under compliance review" in report_summary):
    unreleased = True
    report_url = None
    landing_url = page_url
  else:
    link = result.select("a")[0]
    report_url = urljoin(AUDIT_REPORTS_BASE_URL, link['href'])
    if report_url == AUDIT_REPORTS_BASE_URL:
      raise Exception("Invalid link found: %s" % link)
    unreleased = False
    landing_url = None

  # HTTPS, even if they haven't updated their links yet
  if report_url is not None:
    report_url = re.sub("^http://www.treasury.gov", "https://www.treasury.gov", report_url)

  # Hard-coded URL correction for this one report.
  if report_url == "https://www.treasury.gov/about/organizational-structure/ig/Documents/OIG-11-071.pdf":
    report_url = "https://www.treasury.gov/about/organizational-structure/ig/Documents/OIG11071.pdf"

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'treasury',
    'inspector_url': 'https://www.treasury.gov/about/organizational-structure/ig/',
    'agency': agency_slug,
    'agency_name': AGENCY_NAMES[agency_slug],
    'type': 'audit',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if unreleased:
    report['unreleased'] = unreleased
  if landing_url:
    report['landing_url'] = landing_url

  return report
Example #53
0
def report_from(result, landing_url, report_type, year_range):
    """Build an FTC OIG report dict from one listing entry.

    The publication date comes from REPORT_PUBLISHED_MAP, else from the
    ``<time datetime=...>`` tag of the linked page (only when the link has
    no file extension), else -- on the semiannual reports page -- from an
    estimate based on "Fiscal Year YYYY" and "(First Half)"/"(Second Half)"
    in the title.

    Args:
        result: parsed HTML node containing the report link.
        landing_url: URL of the listing page; relative links resolve
            against it.
        report_type: value stored under 'type'.
        year_range: container of years; other years are skipped.

    Returns:
        The report dict, or None when the entry has no link, no date can be
        determined, or the year is outside year_range.
    """
    link = result.find("a")
    if not link:
        # Nothing to link to; skip rather than fail on link.get().
        return

    report_url = urljoin(landing_url, link.get('href').strip())
    report_filename = report_url.split("/")[-1]
    report_id, _ = os.path.splitext(report_filename)

    title = link.text

    file_type = None
    unreleased = False
    if "Non Public Report" in title.replace(
            "-", " "):  # Normalize title for easier detection
        # The link is only a description page, so it becomes the landing
        # URL and the report itself has no URL.
        unreleased = True
        landing_url = report_url
        report_url = None
    elif not report_url.endswith(".pdf"):
        # A link to an html report
        file_type = "html"

    estimated_date = False
    published_on = None
    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]

    if not published_on:
        if not os.path.splitext(report_filename)[1]:
            # report_url is None for non-public reports; their page URL now
            # lives in landing_url, so fetch that instead of passing None.
            report_doc = utils.beautifulsoup_from_url(report_url or landing_url)
            if report_doc:
                time_tag = report_doc.time
                date_text = time_tag.get("datetime") if time_tag else None
                if date_text:
                    try:
                        published_on = datetime.datetime.strptime(
                            date_text, "%Y-%m-%d %H:%M:%S")
                    except ValueError:
                        # Unexpected timestamp format; try other fallbacks.
                        pass

    if not published_on:
        if landing_url == SEMIANNUAL_REPORTS_URL:
            fy_match = re.match("Fiscal Year ([0-9]{4})", title)
            if fy_match:
                year = int(fy_match.group(1))
                # Only the half-year is known, so use its last day and
                # mark the date as estimated.
                if "(First Half)" in title:
                    published_on = datetime.datetime(year, 3, 31)
                    estimated_date = True
                elif "(Second Half)" in title:
                    published_on = datetime.datetime(year, 9, 30)
                    estimated_date = True

    if not published_on:
        admin.log_no_date("ftc", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'ftc',
        'inspector_url':
        "https://www.ftc.gov/about-ftc/office-inspector-general",
        'agency': 'ftc',
        'agency_name': "Federal Trade Commission",
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if estimated_date:
        report['estimated_date'] = estimated_date
    if unreleased:
        report['unreleased'] = unreleased
        report['landing_url'] = landing_url
    if file_type:
        report['file_type'] = file_type
    return report
def rss_report_from(result, year_range):
  """Build a Smithsonian OIG report dict from one RSS feed item.

  The item's link is rewritten to the current site layout (or replaced via
  RSS_BROKEN_LINKS), de-duplicated against the module-level
  ``report_ids_seen`` set, and dated from the item's ``pubdate``.

  Args:
    result: parsed RSS ``<item>`` node.
    year_range: container of years; other years are skipped.

  Returns:
    The report dict, or None for announcement items, the retired strategic
    plan, already-seen report IDs, unparseable dates, and years outside
    year_range.
  """
  report_url = result.find("link").next_sibling.strip()

  # This is the default url the IG uses for announcements of things like
  # a website redesign or changes to the RSS feed.
  if report_url.rstrip("/") == 'http://www.si.edu/oig':
    return

  # This strategic plan is no longer on the website, but it is reproduced in
  # multiple semiannual reports, so we skip it here.
  if report_url == "http://www.si.edu/oig/OIGStratPlan.pdf":
    return

  if report_url in RSS_BROKEN_LINKS:
    report_url = RSS_BROKEN_LINKS[report_url]
  else:
    # Applied in this order; each maps an old path segment to its new home.
    path_rewrites = (
      ("/OIG/SAR/Semiannual_Reports/", "/OIG/SAR/"),
      ("/oig/Semiannual_Reports/", "/Content/OIG/SAR/"),
      ("/oig/AuditReports/", "/Content/OIG/Audits/"),
      ("/oig/ARRA_Reports/", "/Content/OIG/Audits/"),
    )
    for old_path, new_path in path_rewrites:
      report_url = report_url.replace(old_path, new_path)

  file_type = None if report_url.endswith(".pdf") else "html"

  report_id = os.path.splitext(report_url.split("/")[-1])[0]

  if report_id in report_ids_seen:
    return
  report_ids_seen.add(report_id)

  title = result.find("title").text
  report_type = report_type_from_url(report_url)

  # The feed uses either abbreviated or full month names.
  published_on = None
  published_on_text = result.find("pubdate").text
  for pubdate_format in ('%a, %d %b %Y %H:%M:%S %z',
                         '%a, %d %B %Y %H:%M:%S %z'):
    try:
      published_on = datetime.datetime.strptime(published_on_text, pubdate_format).date()
      break
    except ValueError:
      continue

  if not published_on:
    admin.log_no_date("smithsonian", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'smithsonian',
    'inspector_url': 'https://www.si.edu/OIG',
    'agency': 'smithsonian',
    'agency_name': 'Smithsonian Institution',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if file_type:
    report['file_type'] = file_type
  return report
Example #55
0
def report_from(result, landing_url, year_range):
  """Build a Farm Credit Administration OIG report dict from one link.

  The publication date comes from REPORT_PUBLISHED_MAPPING, else from the
  text following the link in its parent element ("Month D, YYYY", optionally
  in parentheses), else from a month abbreviation and year found in that
  text. The report type is derived from the nearest preceding
  ``<p class="mainContentheader2">`` heading.

  Args:
    result: parsed ``<a>`` node.
    landing_url: URL of the page the link came from; relative links resolve
      against it.
    year_range: container of years; other years are skipped.

  Returns:
    The report dict, or None for anchor/navigation links, links with no
    resolvable date, and years outside year_range.
  """
  report_url = urljoin(landing_url, result.get('href'))

  # HTTPS, even if they haven't updated their links yet
  report_url = re.sub("^http://www.fca.gov", "https://www.fca.gov", report_url)

  if landing_url + '#' in report_url:
    # These are just anchor links, skip them.
    return

  if result.find_parent("ul") and result.find_parent("ul").get('type') == 'disc':
    # These are just anchor links, skip them.
    return

  title = clean_text(result.text)

  if title == 'Inspector General Reports':
    # Just a return link to the main IG page
    return

  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  # These two URLs get explicit, year-suffixed IDs instead of the bare
  # filename.
  if report_url == "https://www.fca.gov/Download/InspectorGeneral/Inspectionrpts/TravelCardProgram.pdf":
    report_id = "TravelCardProgram-2017"
  if report_url == "https://www.fca.gov/Download/InspectorGeneral/Inspectionrpts/PurchaseCardProgram.pdf":
    report_id = "PurchaseCardProgram-2017"

  published_on = None
  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]
  if not published_on:
    try:
      li = result.parent
      if li.name == "u":
        # Link is wrapped in <u>; step up one more level.
        li = li.parent
      # Text right after the link, minus a leading comma and any
      # parenthesised suffix.
      published_on_text = li.contents[1].lstrip(",").split("(")[0].strip()
    except (IndexError, TypeError):
      published_on_text = result.text.strip()
    published_on_text = clean_text(published_on_text)

    try:
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except ValueError:
      pass

  if not published_on:
    try:
      # Date given entirely in parentheses after the link.
      published_on_text = li.contents[1].strip().lstrip("(").rstrip(")")
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except (IndexError, TypeError, ValueError):
      pass

  if not published_on:
    try:
      # Last resort: first three letters of a word plus a four-digit year.
      published_on_text = "/".join(re.search(r"(\w{3}).* (\d{4})", published_on_text).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%b/%Y')
    except (AttributeError, ValueError):
      # AttributeError: no match. ValueError: the three letters matched are
      # not a month abbreviation.
      pass

  if not published_on:
    admin.log_no_date("fca", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report_type_text = result.find_previous("p", class_="mainContentheader2").text.strip()
  report_type = type_for_report(report_type_text)

  report = {
    'inspector': 'fca',
    'inspector_url': 'https://www.fca.gov/home/inspector.html',
    'agency': 'fca',
    'agency_name': 'Farm Credit Administration',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
def report_from(result, year_range, report_type=None):
    """Build a CFTC OIG report dict from a link (or a node containing links).

    The publication date comes from REPORT_PUBLISHED_MAPPING, else from a
    "Month D, YYYY" date in the link text, else from the same pattern in the
    node that follows the link.

    Args:
        result: parsed HTML node; either the ``<a>`` itself or a node whose
            last ``<a>`` is the report link.
        year_range: container of years; other years are skipped.
        report_type: value stored under 'type'. When not given it is derived
            from the title, then from the preceding ``<p>``, and finally
            defaults to "other".

    Returns:
        The report dict, or None when no date can be determined or the year
        is outside year_range.
    """
    if result.name == 'a':
        link = result
    else:
        link = result.select("a")[-1]

    href = link['href']
    # Some links carry a local file path prefix; strip it so the remainder
    # resolves against REPORTS_URL.
    href = href.replace("file://///cftc.gov/home/dc/MWOODLAND/Desktop/", "")
    report_url = urljoin(REPORTS_URL, href)
    report_filename = report_url.split("/")[-1]
    report_id, _ = os.path.splitext(report_filename)

    title = link.text

    published_on = None
    if report_id in REPORT_PUBLISHED_MAPPING:
        published_on = REPORT_PUBLISHED_MAPPING[report_id]
    if not published_on:
        # Look in the link text first, then in whatever follows the link.
        for candidate_text in (title, str(link.next_sibling)):
            date_match = re.search(r"(\w+) (\d+), (\d+)", candidate_text)
            if not date_match:
                continue
            try:
                published_on = datetime.datetime.strptime(
                    "/".join(date_match.groups()), '%B/%d/%Y')
            except ValueError:
                # Shaped like a date (e.g. "Report 12, 2015") but not one.
                continue
            break
    if not published_on:
        admin.log_no_date("cftc", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    if not report_type:
        report_type = extract_report_type(title)
    if not report_type:
        report_type = extract_report_type(result.find_previous("p").text)
    if not report_type:
        report_type = "other"

    report = {
        'inspector': 'cftc',
        'inspector_url':
        'http://www.cftc.gov/About/OfficeoftheInspectorGeneral/index.htm',
        'agency': 'cftc',
        'agency_name': 'Commodity Futures Trading Commission',
        'file_type': 'pdf',
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'type': report_type,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    return report