def extract_report(report):
  report_path = path_for(report, report['file_type'])
  real_report_path = os.path.abspath(
      os.path.expandvars(os.path.join(utils.data_dir(), report_path)))
  text_path = "%s.txt" % os.path.splitext(report_path)[0]
  real_text_path = os.path.abspath(
      os.path.expandvars(os.path.join(utils.data_dir(), text_path)))

  if os.path.exists(real_text_path):
    # This report has already had its text extracted
    return text_path

  file_type_lower = report['file_type'].lower()
  if file_type_lower == "pdf":
    utils.text_from_pdf(real_report_path, real_text_path)
    return text_path
  elif file_type_lower == "doc":
    utils.text_from_doc(real_report_path, real_text_path)
    return text_path
  elif file_type_lower in FILE_EXTENSIONS_HTML:
    utils.text_from_html(real_report_path, real_text_path)
    return text_path
  else:
    logging.warn("Unknown file type, don't know how to extract text!")
    return None
def extract_report(report):
  report_path = path_for(report, report['file_type'])
  real_report_path = os.path.abspath(
      os.path.expandvars(os.path.join(utils.data_dir(), report_path)))
  text_path = "%s.txt" % os.path.splitext(report_path)[0]
  real_text_path = os.path.abspath(
      os.path.expandvars(os.path.join(utils.data_dir(), text_path)))

  if os.path.exists(real_text_path):
    # This report has already had its text extracted
    return text_path

  file_type_lower = report['file_type'].lower()
  if file_type_lower == "pdf":
    if utils.check_pdf_decryption(real_report_path):
      # Encrypted PDF: reuse or create a sibling .decrypted.pdf copy, then
      # extract text from the decrypted copy.
      real_decrypted_path = real_report_path[:-4] + ".decrypted.pdf"
      if os.path.isfile(real_decrypted_path) or utils.decrypt_pdf(real_report_path, real_decrypted_path):
        utils.text_from_pdf(real_decrypted_path, real_text_path)
        return text_path
    else:
      utils.text_from_pdf(real_report_path, real_text_path)
      return text_path
  elif file_type_lower == "doc":
    utils.text_from_doc(real_report_path, real_text_path)
    return text_path
  elif file_type_lower == "docx":
    utils.text_from_docx(real_report_path, real_text_path)
    return text_path
  elif file_type_lower in FILE_EXTENSIONS_HTML:
    utils.text_from_html(real_report_path, real_text_path)
    return text_path
  else:
    logging.warn("Unknown file type, don't know how to extract text!")
    return None
def extract_metadata(report):
  report_path = path_for(report, report['file_type'])
  file_type_lower = report['file_type'].lower()

  if file_type_lower == "pdf":
    real_report_path = os.path.abspath(
        os.path.expandvars(os.path.join(utils.data_dir(), report_path)))
    if utils.check_pdf_decryption(real_report_path):
      # Encrypted PDF: read metadata from the decrypted sibling copy.
      real_decrypted_path = real_report_path[:-4] + ".decrypted.pdf"
      decrypted_path = report_path[:-4] + ".decrypted.pdf"
      if os.path.isfile(real_decrypted_path) or utils.decrypt_pdf(
          real_report_path, real_decrypted_path):
        metadata = utils.metadata_from_pdf(decrypted_path)
      else:
        metadata = None
    else:
      metadata = utils.metadata_from_pdf(report_path)
    if metadata:
      report['pdf'] = metadata
      return metadata
  elif file_type_lower == "doc":
    metadata = utils.metadata_from_doc(report_path)
    if metadata:
      report['doc'] = metadata
      return metadata
  elif file_type_lower == "docx":
    metadata = utils.metadata_from_docx(report_path)
    if metadata:
      report['docx'] = metadata
      return metadata
  elif file_type_lower in FILE_EXTENSIONS_HTML:
    return None
  else:
    logging.warn("Unknown file type, don't know how to extract metadata!")
    return None
def scan_disk(self, inspector, scraper):
  self.disk[inspector] = {}
  data_dir = utils.data_dir()
  inspector_path = os.path.join(data_dir, inspector)
  if os.path.isdir(inspector_path):
    for year_folder in os.listdir(inspector_path):
      year_disk = int(year_folder)
      year_path = os.path.join(inspector_path, year_folder)
      if os.path.isdir(year_path):
        for report_id_disk in os.listdir(year_path):
          report_path = os.path.join(year_path, report_id_disk)
          report_id_disk = CaseInsensitiveString(report_id_disk)
          if os.path.isdir(report_path):
            if report_id_disk in self.disk[inspector]:
              year_last = self.disk[inspector][report_id_disk]
              msg = "[%s] Duplicate report_id: %s is saved under %d and %d" % \
                  (inspector, report_id_disk, year_last, year_disk)
              print(msg)
              admin.log_duplicate_id(inspector, report_id_disk, msg)
            self.disk[inspector][report_id_disk] = year_disk
def write_report(report):
  data_path = path_for(report, "json")
  utils.write(
    utils.json_for(report),
    os.path.join(utils.data_dir(), data_path)
  )
  return data_path
def write_report(report):
  data_path = path_for(report, "json")
  utils.write(
    utils.json_for(report),
    "%s/%s" % (utils.data_dir(), data_path)
  )
  return data_path
def download_report(report):
  report_path = path_for(report, report["file_type"])
  binary = report["file_type"].lower() == "pdf"
  result = utils.download(report["url"],
                          os.path.join(utils.data_dir(), report_path),
                          {"binary": binary})
  if result:
    return report_path
  else:
    return None
def download_report(report):
  report_path = path_for(report, report['file_type'])
  binary = (report['file_type'].lower() in ('pdf', 'doc', 'ppt'))
  result = utils.download(report['url'],
                          os.path.join(utils.data_dir(), report_path),
                          {'binary': binary})
  if result:
    return report_path
  else:
    return None
def check_uniqueness(inspector, report_id, report_year):
  '''Given the name of an inspector, the ID of a report, and the year of the
  report, this function will check whether a duplicate report_id exists
  on-disk under a different year, or whether a duplicate report_id has been
  saved this session, in the same year or any other year. The index of reports
  already saved is lazily built on the first call from each inspector.
  Duplicate reports detected here will be collected, and a summary will be
  sent via admin.notify().'''

  # Be conservative, don't allow report_id to only differ in case
  report_id = report_id.lower()

  # Lazily set up data structures and read existing IDs from disk
  if inspector not in _uniqueness_storage_runtime:
    _uniqueness_storage_runtime[inspector] = set()
  if inspector not in _uniqueness_storage_disk:
    _uniqueness_storage_disk[inspector] = {}
    data_dir = utils.data_dir()
    inspector_path = os.path.join(data_dir, inspector)
    if os.path.isdir(inspector_path):
      for year_folder in os.listdir(inspector_path):
        year_disk = int(year_folder)
        year_path = os.path.join(inspector_path, year_folder)
        if os.path.isdir(year_path):
          for report_id_disk in os.listdir(year_path):
            report_path = os.path.join(year_path, report_id_disk)
            if os.path.isdir(report_path):
              if report_id_disk in _uniqueness_storage_disk[inspector]:
                msg = "[%s] Duplicate report_id: %s is saved under %d and %d" % \
                    (inspector, report_id_disk,
                     _uniqueness_storage_disk[inspector][report_id_disk],
                     year_disk)
                print(msg)
                _uniqueness_messages.append(msg)
              _uniqueness_storage_disk[inspector][report_id_disk] = year_disk

  if report_id in _uniqueness_storage_runtime[inspector]:
    msg = "[%s] Duplicate report_id: %s has been used twice this session" % \
        (inspector, report_id)
    print(msg)
    _uniqueness_messages.append(msg)
  elif report_id in _uniqueness_storage_disk[inspector]:
    if report_year != _uniqueness_storage_disk[inspector][report_id]:
      msg = "[%s] Duplicate report_id: %s is saved under %d and %d" % \
          (inspector, report_id,
           _uniqueness_storage_disk[inspector][report_id],
           report_year)
      print(msg)
      _uniqueness_messages.append(msg)

  _uniqueness_storage_runtime[inspector].add(report_id)
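# A minimal usage sketch, not part of the original module: the module-level
# containers below mirror the names check_uniqueness() relies on (their real
# initialization may differ), and a scraper would typically call the function
# once per report before saving it, with admin.notify() sending the collected
# _uniqueness_messages at the end of a run.
_uniqueness_messages = []
_uniqueness_storage_runtime = {}
_uniqueness_storage_disk = {}

# e.g., inside a scraper's save path (hypothetical values):
#   check_uniqueness('dod', report['report_id'], int(report['year']))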
def download_report(report, caller_scraper=None):
  report_path = path_for(report, report['file_type'])
  binary = (report['file_type'].lower() in ('pdf', 'doc', 'ppt', 'docx', 'xls'))
  result = utils.download(report['url'],
                          os.path.join(utils.data_dir(), report_path),
                          {'binary': binary},
                          scraper_slug=caller_scraper)
  if result:
    return report_path
  else:
    return None
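# Hypothetical glue (a sketch only, not the module's actual save routine):
# one way a scraper runner might chain the helpers above, downloading the
# file and then extracting text and metadata before persisting the JSON
# record. The function name and ordering here are assumptions for
# illustration.
def _save_report_sketch(report, caller_scraper=None):
  if download_report(report, caller_scraper=caller_scraper) is None:
    return None
  extract_report(report)
  extract_metadata(report)
  return write_report(report)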
def download_report(report):
  report_path = path_for(report, report['file_type'])
  binary = (report['file_type'] == 'pdf')
  result = utils.download(
    report['url'],
    "%s/%s" % (utils.data_dir(), report_path),
    {'binary': binary}
  )
  if result:
    return report_path
  else:
    return None
def extract_metadata(report):
  report_path = path_for(report, report["file_type"])
  real_report_path = os.path.join(utils.data_dir(), report_path)
  file_type_lower = report["file_type"].lower()
  if file_type_lower == "pdf":
    metadata = utils.metadata_from_pdf(report_path)
    if metadata:
      report["pdf"] = metadata
      return metadata
  elif file_type_lower == "htm" or file_type_lower == "html":
    return None
  else:
    logging.warn("Unknown file type, don't know how to extract metadata!")
    return None
def extract_metadata(report):
  report_path = path_for(report, report['file_type'])
  real_report_path = os.path.join(utils.data_dir(), report_path)
  file_type_lower = report['file_type'].lower()
  if file_type_lower == "pdf":
    metadata = utils.metadata_from_pdf(report_path)
    if metadata:
      report['pdf'] = metadata
      return metadata
  elif file_type_lower in FILE_EXTENSIONS_HTML:
    return None
  else:
    logging.warn("Unknown file type, don't know how to extract metadata!")
    return None
def report_from(tds, options):
  report = {
    'inspector': 'dod',
    'inspector_url': 'http://www.dodig.mil/',
    'agency': 'dod',
    'agency_name': 'Department of Defense',
  }

  title_link = tds[2].select('a')[0]
  title = title_link.text.strip().replace('\r\n', ' ')
  landing_url = urljoin(BASE_URL, title_link['href'])
  if landing_url in LANDING_PAGE_BLACKLIST:
    return

  published_date = datetime.datetime.strptime(tds[0].text.strip(), '%m-%d-%Y')
  published_on = published_date.strftime('%Y-%m-%d')

  topic = tds[1].text

  report_id = tds[2].select('strong')
  if report_id:
    report_id = report_id[0].text.strip()
  else:
    title_slug = re.sub(r'\W', '', title[:16])
    report_id = (published_on + '-' + title_slug)

  # helper: use --report_id to skip all but that one
  only_id = options.get('report_id')
  if only_id and (only_id != report_id):
    return

  # helper: use --skip_downloaded to skip reports whose PDFs are on disk
  # (drastically reduces calls to DOD landing pages)
  if options.get('skip_downloaded'):
    pdf_path = inspector.path_for({
      'report_id': report_id,
      'year': str(published_date.year),
      'inspector': 'dod'
    }, 'pdf')
    if os.path.exists(os.path.join(utils.data_dir(), pdf_path)):
      logging.warn("\tSkipping previously downloaded report, as asked.")
      return

  report_url, summary, maybe_unreleased, skip = fetch_from_landing_page(landing_url)
  if skip:
    return

  if (report_url is None) and maybe_unreleased:
    report['unreleased'] = True

  # broken reports: mark as unreleased, but also mark as broken
  # blacklisted reports, or, from now on, anything in 2001 and before
  # I'll investigate the batch of 'missing' later.
  if (report_url is None) and ((report_id in BLACKLIST) or (published_date.year <= 2001)):
    report['unreleased'] = True
    report['missing'] = True

  # giving up on any more Guam errors, we've caught as many cases
  # as we reasonably can, and there are Guam entries that aren't reports.
  elif (report_url is None) and (re.search("guam", landing_url)):
    return

  office = tds[3].text.strip()

  report.update({
    'report_id': report_id,
    'url': report_url,
    'landing_url': landing_url,
    'summary': summary,
    'title': title,
    'topic': topic,
    'office': office,
    'published_on': published_on
  })
  return report
def report_from(tds, options):
  report = {
    'inspector': 'dod',
    'inspector_url': 'http://www.dodig.mil/',
    'agency': 'dod',
    'agency_name': 'Department of Defense',
  }

  title_link = tds[2].select('a')[0]
  title = title_link.text.strip().replace('\r\n', ' ')
  landing_url = urljoin(BASE_URL, title_link['href'])

  published_date = datetime.datetime.strptime(tds[0].text.strip(), '%m-%d-%Y')
  published_on = published_date.strftime('%Y-%m-%d')

  topic = tds[1].text

  report_id = tds[2].select('strong')
  if report_id:
    report_id = report_id[0].text.strip()
  else:
    title_slug = re.sub(r'\W', '', title[:16])
    report_id = (published_on + '-' + title_slug)

  # helper: use --report_id to skip all but that one
  only_id = options.get('report_id')
  if only_id and (only_id != report_id):
    return

  # helper: use --skip_downloaded to skip reports whose PDFs are on disk
  # (drastically reduces calls to DOD landing pages)
  if options.get('skip_downloaded'):
    pdf_path = inspector.path_for({
      'report_id': report_id,
      'year': str(published_date.year),
      'inspector': 'dod'
    }, 'pdf')
    if os.path.exists(os.path.join(utils.data_dir(), pdf_path)):
      logging.warn("\tSkipping previously downloaded report, as asked.")
      return

  report_url, summary, maybe_unreleased, skip = fetch_from_landing_page(landing_url)
  if skip:
    return

  if (report_url is None) and maybe_unreleased:
    report['unreleased'] = True

  # broken reports: mark as unreleased, but also mark as broken
  # blacklisted reports, or, from now on, anything in 2001 and before
  # I'll investigate the batch of 'missing' later.
  if (report_url is None) and ((report_id in BLACKLIST) or (published_date.year <= 2001)):
    report['unreleased'] = True
    report['missing'] = True

  # giving up on any more Guam errors, we've caught as many cases
  # as we reasonably can, and there are Guam entries that aren't reports.
  elif (report_url is None) and (re.search("guam", landing_url)):
    return

  office = tds[3].text.strip()

  report.update({
    'report_id': report_id,
    'url': report_url,
    'landing_url': landing_url,
    'summary': summary,
    'title': title,
    'topic': topic,
    'office': office,
    'published_on': published_on
  })
  return report