def report_from(result, landing_url, report_type, year_range):
  title = result.select("td")[-1].text

  if "contains sensitive information" in title:
    unreleased = True
    report_url = None
    report_id = inspector.slugify("-".join(title.split())[:100])
    it_controls_match = IT_CONTROLS_RE.match(title)
    if it_controls_match:
      report_id = "%s-%s" % (report_id, it_controls_match.group(1))
  else:
    unreleased = False
    link = result.find("a")
    report_id = inspector.slugify(link.text)
    report_url = urljoin(landing_url, link.get('href'))

  if landing_url == SEMIANNUAL_REPORTS_URL:
    if title.find("Transmittal Letter") != -1:
      report_id = report_id + "-transmittal"

  estimated_date = False
  try:
    published_on = datetime.datetime.strptime(report_id.strip(), '%m.%d.%y')
  except ValueError:
    # For reports where we can only find the year, set them to Nov 1st of that year
    published_on_year_text = result.find_previous("th").text
    published_on_year = int(published_on_year_text.replace("Fiscal Year ", ""))
    published_on = datetime.datetime(published_on_year, 11, 1)
    estimated_date = True

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'gpo',
    'inspector_url': 'http://www.gpo.gov/oig/',
    'agency': 'gpo',
    'agency_name': 'Government Publishing Office',
    'file_type': 'pdf',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  return report
def report_from(result, landing_url, report_type, year_range):
  title = result.select("td")[-1].text
  title = re.sub("\\s+", " ", title)
  report_id_match = REPORT_ID_RE.match(result.td.text.strip())

  if ("contains sensitive information" in title
          or "This correspondence will not be posted" in title
          or title == "Unscheduled and Unpaid Absenteeism in the Office of "
                      "Plant Operations"):
    unreleased = True
    report_url = None
    if report_id_match:
      report_id = report_id_match.group(0)
    else:
      report_id = inspector.slugify("-".join(title.strip().split())[:100])
  else:
    unreleased = False
    link = result.find("a")
    report_id = inspector.slugify(link.text.strip())
    if link.get('href') == "#":
      unreleased = True
      report_url = None
    else:
      report_url = urljoin(landing_url, link.get('href'))

  if landing_url == SEMIANNUAL_REPORTS_URL:
    if title.find("Transmittal Letter") != -1:
      report_id = report_id + "-transmittal"

  published_on = None
  try:
    published_on = datetime.datetime.strptime(link.text.strip(), '%m.%d.%y')
  except (ValueError, UnboundLocalError):
    pass

  if not published_on:
    if report_url:
      date_match = DATE_RE.search(report_url)
      if date_match:
        date_text = date_match.group(1)
        published_on = datetime.datetime.strptime(date_text, "%m-%d-%y")

  if not published_on:
    if report_id in REPORT_PUBLISHED_MAP:
      published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    admin.log_no_date("gpo", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'gpo',
    'inspector_url': 'http://www.gpo.gov/oig/',
    'agency': 'gpo',
    'agency_name': 'Government Publishing Office',
    'file_type': 'pdf',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  return report
def make_report_id(url):
  return inspector.slugify(
    url.replace('/PublicFiles/', '').replace('/publicfiles/', '').replace('.pdf', ''))
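# Usage sketch for make_report_id (illustrative only; the sample URL below is
# hypothetical and the final id depends on inspector.slugify's rules). It shows
# that the "/PublicFiles/" path segment and ".pdf" suffix are stripped before
# slugification, with the remaining pieces of the URL concatenated together.
def _make_report_id_example():
  sample_url = "http://www.gpo.gov/oig/PublicFiles/sample-audit-15-01.pdf"
  stripped = sample_url.replace('/PublicFiles/', '').replace('/publicfiles/', '').replace('.pdf', '')
  # stripped == "http://www.gpo.gov/oigsample-audit-15-01"
  return make_report_id(sample_url)  # slugified form of the stripped string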
def report_from(result, landing_url, report_type, year_range):
  title = result.select("td")[-1].text
  title = re.sub("\\s+", " ", title)
  report_id_match = REPORT_ID_RE.match(result.td.text.strip())

  if ("contains sensitive information" in title
          or "This correspondence will not be posted" in title
          or title in UNPUBLISHED_REPORT_TITLES):
    unreleased = True
    report_url = None
    if report_id_match:
      report_id = report_id_match.group(0)
    else:
      report_id = inspector.slugify("-".join(title.strip().split())[:100])
  else:
    unreleased = False
    link = result.find("a")
    report_id = inspector.slugify(link.text.strip())
    if link.get('href') == "#":
      unreleased = True
      report_url = None
    else:
      report_url = urljoin(landing_url, link.get('href'))

  if landing_url == SEMIANNUAL_REPORTS_URL:
    if title.find("Transmittal Letter") != -1:
      report_id = report_id + "-transmittal"

  published_on = None
  try:
    published_on = datetime.datetime.strptime(link.text.strip(), '%m.%d.%y')
  except (ValueError, UnboundLocalError):
    pass

  if not published_on:
    if report_url:
      date_match = DATE_RE.search(report_url)
      if date_match:
        date_text = date_match.group(1)
        published_on = datetime.datetime.strptime(date_text, "%m-%d-%y")

  if not published_on:
    if report_id in REPORT_PUBLISHED_MAP:
      published_on = REPORT_PUBLISHED_MAP[report_id]

  if not published_on:
    admin.log_no_date("gpo", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'gpo',
    'inspector_url': 'http://www.gpo.gov/oig/',
    'agency': 'gpo',
    'agency_name': 'Government Publishing Office',
    'file_type': 'pdf',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  return report
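# A minimal driver sketch showing how report_from above might be invoked. This
# is not the project's actual run() function: the AUDIT_REPORTS_URL constant,
# the "audit" report type, and the table-row selector are assumptions, and it
# presumes requests/BeautifulSoup are installed and that inspector.save_report
# persists a report dict.
import requests
from bs4 import BeautifulSoup

def run_sketch(year_range):
  landing_url = AUDIT_REPORTS_URL  # hypothetical listing-page constant
  doc = BeautifulSoup(requests.get(landing_url).text, "html.parser")
  for result in doc.select("table tr"):
    if not result.select("td"):
      continue  # skip header rows, which only contain <th> cells
    report = report_from(result, landing_url, "audit", year_range)
    if report:
      inspector.save_report(report)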
def report_from(result, category_name, agency, year_range):
  # Ignore the result if it's not in our agency string->slug mapping, or if the
  # mapping has an empty slug. That means it doesn't come from an agency whose
  # IG we track; it may be a document from a local government, etc.
  if ((category_name, agency) not in GOVATTIC_MAPPING_DICT
          or GOVATTIC_MAPPING_DICT[(category_name, agency)][-1] == ''):
    return
  (ig_short, ig_url, ig_slug) = GOVATTIC_MAPPING_DICT[(category_name, agency)]

  a = result.find('a')
  if not a:
    # There's no link, so this must just be some explanatory text, such as the footer.
    return
  report_url = a['href']

  # These will be stored in folders alongside documents scraped by the official
  # IG scrapers, so use the governmentattic URL as the slug to avoid conflicts.
  report_id = inspector.slugify(report_url.replace('http://www.', ''))

  title = remove_linebreaks(a.text).strip()
  if not title:
    return

  text = remove_linebreaks(result.text)
  datematch = DATE_RE.search(text)
  published_on = None
  datestring = None

  if report_id == "governmentattic.org-21docs-ComplaintsRcvdCFTC_CY2013-2014.pdf":
    if title == "Commodity Futures Trading Commission (CFTC)":
      # Copy-paste error, skip
      return

  if datematch:
    datestring = '-'.join(datematch.groups())  # e.g. '01-Mar-2015'
    datestring = datestring.replace("-Sept-", "-Sep-")
    try:
      published_on = datetime.datetime.strptime(datestring, '%d-%b-%Y')
    except ValueError:
      published_on = None
    if not published_on:
      try:
        published_on = datetime.datetime.strptime(datestring, '%d-%B-%Y')
      except ValueError:
        published_on = None

  if not published_on:
    admin.log_no_date("governmentattic", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  # Ignore documents that are interesting FOIAs but are not IG reports.
  # To scrape both IG and agency documents, set IG_REPORTS_ONLY=False.
  if IG_REPORTS_ONLY and 'OIG' not in title and 'inspector general' not in title.lower():
    logging.debug("[%s] Skipping, not an IG report." % title)
    return

  report = {
    # Store these with their natively-scraped counterparts, not in a
    # govattic-specific place; agency and IG slug are the same, and the IG's
    # short name stands in for the agency name.
    'inspector': ig_slug,
    'inspector_url': ig_url,
    'agency': ig_slug,
    'agency_name': ig_short,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': 'FOIA - GovernmentAttic.org',  # type of report (default is 'other')
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),  # date posted to GovAttic, not released by the IG
  }
  return report
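# Illustrative only: report_from above assumes GOVATTIC_MAPPING_DICT maps a
# (category_name, agency) pair, as scraped from governmentattic.org, to a
# (short IG name, IG homepage URL, IG slug) triple, with an empty slug marking
# agencies whose IG is not tracked. The entry below is a hypothetical example
# of that shape, not the project's real table.
GOVATTIC_MAPPING_DICT_EXAMPLE = {
  ("Federal Departments and Agencies", "Department of Example (DOE)"):
    ("Department of Example OIG", "https://oig.example.gov/", "example-oig"),
}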