Example #1
0
def report_from(result, landing_url, report_type, year_range):
    """Build a report dict from one scraped table row.

    Returns None when the report's publication year is outside year_range.
    """
    title = result.select("td")[-1].text

    unreleased = "contains sensitive information" in title
    if unreleased:
        # Sensitive reports are never published; derive an id from the title.
        report_url = None
        report_id = inspector.slugify("-".join(title.split())[:100])
        controls_match = IT_CONTROLS_RE.match(title)
        if controls_match:
            report_id = "%s-%s" % (report_id, controls_match.group(1))
    else:
        link = result.find("a")
        report_id = inspector.slugify(link.text)
        report_url = urljoin(landing_url, link.get('href'))
        if (landing_url == SEMIANNUAL_REPORTS_URL
                and title.find("Transmittal Letter") != -1):
            report_id += "-transmittal"

    estimated_date = False
    try:
        published_on = datetime.datetime.strptime(report_id.strip(),
                                                  '%m.%d.%y')
    except ValueError:
        # Only the fiscal year is known; approximate as Nov 1st of that year.
        fiscal_year_text = result.find_previous("th").text
        fiscal_year = int(fiscal_year_text.replace("Fiscal Year ", ""))
        published_on = datetime.datetime(fiscal_year, 11, 1)
        estimated_date = True

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'gpo',
        'inspector_url': 'http://www.gpo.gov/oig/',
        'agency': 'gpo',
        'agency_name': 'Government Publishing Office',
        'file_type': 'pdf',
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': published_on.strftime("%Y-%m-%d"),
    }
    if estimated_date:
        report['estimated_date'] = estimated_date
    if unreleased:
        report['unreleased'] = unreleased
        report['landing_url'] = landing_url
    return report
Example #2
0
def report_from(result, landing_url, report_type, year_range):
  """Turn a scraped table row into a report dict, or None if out of range."""
  title = result.select("td")[-1].text

  unreleased = "contains sensitive information" in title
  if unreleased:
    # Sensitive reports carry no link; build the id from the title text.
    report_url = None
    report_id = inspector.slugify("-".join(title.split())[:100])
    it_match = IT_CONTROLS_RE.match(title)
    if it_match:
      report_id = "-".join((report_id, it_match.group(1)))
  else:
    anchor = result.find("a")
    report_id = inspector.slugify(anchor.text)
    report_url = urljoin(landing_url, anchor.get('href'))
    on_semiannual_page = landing_url == SEMIANNUAL_REPORTS_URL
    if on_semiannual_page and title.find("Transmittal Letter") != -1:
      report_id += "-transmittal"

  estimated_date = False
  try:
    published_on = datetime.datetime.strptime(report_id.strip(), '%m.%d.%y')
  except ValueError:
    # Only the fiscal year is discoverable; default to Nov 1st of that year.
    year_header = result.find_previous("th").text
    fiscal_year = int(year_header.replace("Fiscal Year ", ""))
    published_on = datetime.datetime(fiscal_year, 11, 1)
    estimated_date = True

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'gpo',
    'inspector_url': 'http://www.gpo.gov/oig/',
    'agency': 'gpo',
    'agency_name': 'Government Publishing Office',
    'file_type': 'pdf',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': published_on.strftime("%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  return report
Example #3
0
def report_from(result, landing_url, report_type, year_range):
  """Parse one table row into a report dict.

  Returns None when no publication date can be determined or when the date
  falls outside year_range.
  """
  title = result.select("td")[-1].text
  # Collapse runs of whitespace into single spaces.
  title = re.sub("\\s+", " ", title)

  report_id_match = REPORT_ID_RE.match(result.td.text.strip())
  # NOTE: the two adjacent string literals in the last comparison are a
  # single title, concatenated by Python.
  if ("contains sensitive information" in title or
      "This correspondence will not be posted" in title or
      title == "Unscheduled and Unpaid Absenteeism in the Office of "
      "Plant Operations"):
    unreleased = True
    report_url = None
    if report_id_match:
      report_id = report_id_match.group(0)
    else:
      # Fall back to a slug built from the (truncated) title.
      report_id = inspector.slugify("-".join(title.strip().split())[:100])
  else:
    unreleased = False
    link = result.find("a")
    report_id = inspector.slugify(link.text.strip())
    if link.get('href') == "#":
      # Placeholder link: the report exists but is not actually posted.
      unreleased = True
      report_url = None
    else:
      report_url = urljoin(landing_url, link.get('href'))
      if landing_url == SEMIANNUAL_REPORTS_URL:
        if title.find("Transmittal Letter") != -1:
          report_id = report_id + "-transmittal"

  published_on = None
  try:
    # `link` is never bound in the sensitive-report branch above;
    # UnboundLocalError is caught deliberately so we fall through to the
    # other date sources below.
    published_on = datetime.datetime.strptime(link.text.strip(), '%m.%d.%y')
  except (ValueError, UnboundLocalError):
    pass

  if not published_on:
    if report_url:
      # Second attempt: pull a mm-dd-yy date out of the report URL.
      date_match = DATE_RE.search(report_url)
      if date_match:
        date_text = date_match.group(1)
        published_on = datetime.datetime.strptime(date_text, "%m-%d-%y")

  if not published_on:
    # Last resort: hardcoded dates for known reports.
    if report_id in REPORT_PUBLISHED_MAP:
      published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    admin.log_no_date("gpo", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'gpo',
    'inspector_url': 'http://www.gpo.gov/oig/',
    'agency': 'gpo',
    'agency_name': 'Government Publishing Office',
    'file_type': 'pdf',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  return report
def make_report_id(url):
    """Derive a report id slug from a report URL."""
    trimmed = url.replace('/PublicFiles/', '')
    trimmed = trimmed.replace('/publicfiles/', '')
    trimmed = trimmed.replace('.pdf', '')
    return inspector.slugify(trimmed)
Example #5
0
def make_report_id(url):
  """Turn a report URL into a slug, dropping the folder prefix and extension."""
  for fragment in ('/PublicFiles/', '/publicfiles/', '.pdf'):
    url = url.replace(fragment, '')
  return inspector.slugify(url)
def report_from(result, landing_url, report_type, year_range):
    """Parse one table row into a report dict.

    Returns None when no publication date can be determined or when the
    date falls outside year_range.
    """
    title = result.select("td")[-1].text
    # Collapse runs of whitespace into single spaces.
    title = re.sub("\\s+", " ", title)

    report_id_match = REPORT_ID_RE.match(result.td.text.strip())
    if ("contains sensitive information" in title
            or "This correspondence will not be posted" in title
            or title in UNPUBLISHED_REPORT_TITLES):
        unreleased = True
        report_url = None
        if report_id_match:
            report_id = report_id_match.group(0)
        else:
            # Fall back to a slug built from the (truncated) title.
            report_id = inspector.slugify("-".join(
                title.strip().split())[:100])
    else:
        unreleased = False
        link = result.find("a")
        report_id = inspector.slugify(link.text.strip())
        if link.get('href') == "#":
            # Placeholder link: the report exists but is not actually posted.
            unreleased = True
            report_url = None
        else:
            report_url = urljoin(landing_url, link.get('href'))
            if landing_url == SEMIANNUAL_REPORTS_URL:
                if title.find("Transmittal Letter") != -1:
                    report_id = report_id + "-transmittal"

    published_on = None
    try:
        # `link` is never bound in the sensitive-report branch above;
        # UnboundLocalError is caught deliberately so we fall through to
        # the other date sources below.
        published_on = datetime.datetime.strptime(link.text.strip(),
                                                  '%m.%d.%y')
    except (ValueError, UnboundLocalError):
        pass

    if not published_on:
        if report_url:
            # Second attempt: pull a mm-dd-yy date out of the report URL.
            date_match = DATE_RE.search(report_url)
            if date_match:
                date_text = date_match.group(1)
                published_on = datetime.datetime.strptime(
                    date_text, "%m-%d-%y")

    if not published_on:
        # Last resort: hardcoded dates for known reports.
        if report_id in REPORT_PUBLISHED_MAP:
            published_on = REPORT_PUBLISHED_MAP[report_id]

    if not published_on:
        admin.log_no_date("gpo", report_id, title, report_url)
        return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report = {
        'inspector': 'gpo',
        'inspector_url': 'http://www.gpo.gov/oig/',
        'agency': 'gpo',
        'agency_name': 'Government Publishing Office',
        'file_type': 'pdf',
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if unreleased:
        report['unreleased'] = unreleased
        report['landing_url'] = landing_url
    return report
Example #7
0
def report_from(result, category_name, agency, year_range):
  """Extract a report dict from a governmentattic.org result row.

  Returns None when the row is not a tracked IG report, has no parseable
  date, or falls outside year_range.
  """
  # Ignore if it's not in our agency string->slug mapping, or if it's mapped
  # to an empty slug: that means it doesn't come from an agency whose IG we
  # track; it may be a document from a local government, etc.
  if (category_name, agency) not in GOVATTIC_MAPPING_DICT or GOVATTIC_MAPPING_DICT[(category_name, agency)][-1] == '':
    return
  (ig_short, ig_url, ig_slug) = GOVATTIC_MAPPING_DICT[(category_name, agency)]

  a = result.find('a')
  if not a:
    # There's no link, so this must just be some explanatory text, such as
    # the footer.
    return
  report_url = a['href']

  # These will be stored in folders with documents scraped by the official IG
  # scrapers, so use the governmentattic URL as slug to assure no conflict.
  report_id = inspector.slugify(report_url.replace('http://www.', ''))

  title = remove_linebreaks(a.text).strip()
  if not title:
    return

  if report_id == "governmentattic.org-21docs-ComplaintsRcvdCFTC_CY2013-2014.pdf":
    if title == "Commodity Futures Trading Commission (CFTC)":
      # Copy-paste error, skip
      return

  text = remove_linebreaks(result.text)
  datematch = DATE_RE.search(text)
  published_on = None
  if datematch:
    datestring = '-'.join(datematch.groups())  # e.g. '01-Mar-2015'
    datestring = datestring.replace("-Sept-", "-Sep-")
    # Try the abbreviated month name first, then the full month name.
    # Catch only ValueError: the previous bare `except:` clauses silently
    # swallowed everything, including KeyboardInterrupt/SystemExit.
    for date_format in ('%d-%b-%Y', '%d-%B-%Y'):
      try:
        published_on = datetime.datetime.strptime(datestring, date_format)
        break
      except ValueError:
        published_on = None
  if not published_on:
    admin.log_no_date("governmentattic", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  # Ignore documents that are interesting FOIAs but are not IG reports.
  # To scrape both IG and agency documents, set IG_REPORTS_ONLY=False.
  if IG_REPORTS_ONLY and 'OIG' not in title and 'inspector general' not in title.lower():
    logging.debug("[%s] Skipping, not an IG report." % title)
    return

  report = {
    'inspector': ig_slug,     # Store these with their natively-scraped counterparts, not in a govattic-specific place
    'inspector_url': ig_url,
    'agency': ig_slug,        # Agency and IG slug will be the same
    'agency_name': ig_short,  # Take short name of the IG as the agency name. I think this should work.
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': 'FOIA - GovernmentAttic.org', # Type of report (default to 'other')
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d") #date published to GovAttic, not released by IG
  }

  return report