def write_report(report):
    data_path = "%s/%s/%s/report.json" % (report['inspector'], report['year'], report['report_id'])
    utils.write(
        utils.json_for(report),
        "%s/%s" % (utils.data_dir(), data_path)
    )
    return data_path
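# A minimal usage sketch for write_report(). The dict below includes only the
# keys the function actually reads (plus one illustrative extra field); all
# values are hypothetical.
example_report = {
    "inspector": "example-ig",   # hypothetical inspector slug
    "year": "2016",              # hypothetical year
    "report_id": "2016-01",      # hypothetical report ID
    "title": "Example report",   # extra fields are simply serialized as-is
}
# write_report(example_report) would return "example-ig/2016/2016-01/report.json"
# and write the serialized report under utils.data_dir().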
def run(options):
    # Input: A path to the root of a domain-scan output directory.
    scan_path = options.get("scan", ".")
    results_path = os.path.join(scan_path, "results")

    # Output: Where to put the post-processed results.
    output_path = options.get("output", ".")

    # Take the uswds scan data and cut it down to just the rows where USWDS is
    # present or the bad banner text was detected, keeping only the fields
    # needed, in JSON form.
    uswds_csv = os.path.join(results_path, "uswds.csv")

    # Collect lists of dicts to convert into JSON.
    bad_banner = []
    uswds_present = []

    with open(uswds_csv, newline='') as csvfile:
        for dict_row in csv.DictReader(csvfile):
            has_bad_banner = utils.boolean_for(dict_row["USWDS Bad Banner Text"])
            is_uswds_present = utils.boolean_for(dict_row["USWDS Present"])

            if has_bad_banner:
                bad_banner.append({
                    'hostname': dict_row["Domain"],
                    'base_domain': dict_row["Base Domain"],
                    'scanned_url': dict_row["Scanned URL"]
                })

            if is_uswds_present:
                uswds_present.append({
                    'hostname': dict_row["Domain"],
                    'base_domain': dict_row["Base Domain"],
                    'scanned_url': dict_row["Scanned URL"]
                })

    # Save resulting JSON.
    bad_banner_data = utils.json_for(bad_banner)
    bad_banner_output = os.path.join(output_path, "bad_banner.json")
    utils.write(bad_banner_data, bad_banner_output)

    uswds_present_data = utils.json_for(uswds_present)
    uswds_present_output = os.path.join(output_path, "uswds_present.json")
    utils.write(uswds_present_data, uswds_present_output)
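# The CSV handling above assumes at least these columns from domain-scan's
# uswds scanner output (only the ones read here; the real CSV may contain
# more). A sketch of one row as csv.DictReader would yield it, with
# hypothetical values:
example_row = {
    "Domain": "example.gov",
    "Base Domain": "example.gov",
    "Scanned URL": "https://example.gov/",
    "USWDS Present": "True",           # parsed with utils.boolean_for
    "USWDS Bad Banner Text": "False",  # parsed with utils.boolean_for
}
# A row like this would be added to uswds_present (but not bad_banner) and end
# up in <output>/uswds_present.json as its hostname/base_domain/scanned_url subset.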
def save_meta_result(result):
    path = meta_path_for(result['type'], result['agency'], result['year'], result['id'])

    # For paged metadata, don't overwrite if we've got it already;
    # we don't keep anything that should change.
    if os.path.exists(path):
        logging.debug("[%s][%s] Knew about it, skipping." % (result['id'], result['type']))
    else:
        logging.warn("[%s][%s] Newly discovered, saving metadata." % (result['id'], result['type']))
        utils.write(utils.json_for(result), path)
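# Sketch of the result dict save_meta_result() expects: only the four keys it
# reads are required, and meta_path_for() is assumed to build a stable path
# from them. All values below are hypothetical.
example_meta = {
    "type": "record",
    "agency": "EPA",
    "year": "2015",
    "id": "abc123",
}
# save_meta_result(example_meta) writes the JSON the first time the item is
# seen and skips it on later runs, since paged metadata is treated as immutable.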
def main():
    args = docopt.docopt(__doc__, version='v0.0.1')
    utils.configure_logging(args['--debug'])

    out_file = args['--output']

    # Read from a .csv, or allow domains on the command line.
    domains = []
    if args['INPUT'][0].endswith(".csv"):
        domains = utils.load_domains(args['INPUT'][0])
    else:
        domains = args['INPUT']

    # If the user wants to sort them, sort them in place.
    if args['--sorted']:
        domains.sort()

    options = {
        'user_agent': args['--user-agent'],
        'timeout': args['--timeout'],
        'preload_cache': args['--preload-cache'],
        'cache': args['--cache']
    }
    results = pshtt.inspect_domains(domains, options)

    # JSON can go to STDOUT, or to a file.
    if args['--json']:
        output = utils.json_for(results)
        if out_file is None:
            print(output)
        else:
            utils.write(output, out_file)
            logging.warn("Wrote results to %s." % out_file)
    # Markdown can go to STDOUT, or to a file.
    elif args['--markdown']:
        output = sys.stdout
        if out_file is not None:
            output = open(out_file, 'w')

        pshtt.md_for(results, output)

        if out_file is not None:
            output.close()
    # CSV always goes to a file.
    else:
        if args['--output'] is None:
            out_file = 'results.csv'

        pshtt.csv_for(results, out_file)
        logging.warn("Wrote results to %s." % out_file)
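# Example invocations, inferred only from the flags this main() reads (the
# docopt usage string itself is not shown here, so treat these as a sketch):
#
#   pshtt example.com --json                       # JSON to STDOUT
#   pshtt domains.csv --markdown --output=out.md   # Markdown to a file
#   pshtt domains.csv --sorted                     # CSV, defaults to results.csv
#
# A .csv INPUT is expanded via utils.load_domains(); anything else is treated
# as a literal list of domains given on the command line.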
def scan(domain, options):
    logging.debug("[%s][pshtt]" % domain)

    # cache output from pshtt
    cache_pshtt = utils.cache_path(domain, "pshtt", ext="json")

    force = options.get("force", False)
    data = None

    if (force is False) and (os.path.exists(cache_pshtt)):
        logging.debug("\tCached.")
        raw = open(cache_pshtt).read()
        data = json.loads(raw)
        if (data.__class__ is dict) and data.get('invalid'):
            return None
    else:
        logging.debug("\t %s %s" % (command, domain))
        raw = utils.scan([
            command,
            domain,
            '--json',
            '--user-agent', '\"%s\"' % user_agent,
            '--timeout', str(timeout),
            '--preload-cache', preload_cache
        ])
        if not raw:
            utils.write(utils.invalid({}), cache_pshtt)
            logging.warn("\tBad news scanning, sorry!")
            return None
        data = json.loads(raw)
        utils.write(utils.json_for(data), utils.cache_path(domain, "pshtt"))

    # pshtt scanner uses JSON arrays, even for single items
    data = data[0]

    row = []
    for field in headers:
        value = data[field]
        # TODO: Fix this upstream
        if (field != "HSTS Header") and (field != "HSTS Max Age") and (field != "Redirect To"):
            if value is None:
                value = False
        row.append(value)

    yield row
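# Usage sketch: scan() is a generator that yields one CSV-ready row per scanned
# domain. It relies on module-level values (command, user_agent, timeout,
# preload_cache, headers) configured elsewhere in the real scanner, so this is
# only illustrative:
#
#   for row in scan("example.com", {"force": False}):
#       print(row)
#
# With force=False, a previously cached pshtt result under utils.cache_path()
# is reused; with force=True the pshtt command is re-run.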
def create_preload_list():
    preload_json = None

    if PRELOAD_CACHE and os.path.exists(PRELOAD_CACHE):
        logging.debug("Using cached Chrome preload list.")
        preload_json = json.loads(open(PRELOAD_CACHE).read())
    else:
        logging.debug("Fetching Chrome preload list from source...")

        # Downloads the Chromium preloaded domain list.
        file_url = 'https://chromium.googlesource.com/chromium/src/net/+/master/http/transport_security_state_static.json?format=TEXT'

        # TODO: proper try/except around this network request
        request = requests.get(file_url)
        raw = request.content

        # To avoid parsing the contents of the file out of the source tree viewer's
        # HTML, we download it as a raw file. googlesource.com Base64-encodes the
        # file to avoid potential content injection issues, so we need to decode it
        # before using it. https://code.google.com/p/gitiles/issues/detail?id=7
        raw = base64.b64decode(raw).decode('utf-8')

        # The .json file contains '//' comments, which are not actually valid JSON,
        # and confuse Python's JSON decoder. Begone, foul comments!
        raw = ''.join([re.sub(r'^\s*//.*$', '', line) for line in raw.splitlines()])
        preload_json = json.loads(raw)

        if PRELOAD_CACHE:
            logging.debug("Caching preload list at %s" % PRELOAD_CACHE)
            utils.write(utils.json_for(preload_json), PRELOAD_CACHE)

    # For our purposes, we only care about entries that includeSubDomains.
    fully_preloaded = []
    for entry in preload_json['entries']:
        if entry.get('include_subdomains', False) is True:
            fully_preloaded.append(entry['name'])

    return fully_preloaded
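# Usage sketch: the returned list contains only names preloaded with
# include_subdomains, so a membership-or-suffix check covers subdomains too.
# This helper is hypothetical, not part of the original module.
def is_fully_preloaded(hostname, preload_list):
    hostname = hostname.lower().rstrip(".")
    return any(
        hostname == name or hostname.endswith("." + name)
        for name in preload_list
    )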
def do_document(result, page, options):
    if result.get('pdfLink') is None:
        print("\tERROR, no pdfLink for document.")
        return False

    document = clean_document(result)

    # can limit to a particular known document ID, for debugging
    limit_id = options.get('document_id')
    if limit_id and (limit_id != document['document_id']):
        print("\tSkipping, not requested.")
        return False

    # 1) write JSON to disk at predictable path
    json_path = path_for(page, document['document_id'], "json")
    utils.write(utils.json_for(document), json_path)

    # 2) download pdfLink (unless dry run)
    if options.get('dry_run') is None:
        print("\t%s" % document['document_id'])
        pdf_path = path_for(page, document['document_id'], document['file_type'])
        result = utils.download(
            document['url'],
            pdf_path,
            {
                'binary': (document['file_type'].lower() == 'pdf'),
                'cache': not (options.get('force', False))
            }
        )
        if result:
            utils.text_from_pdf(pdf_path)

    return True
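# A minimal sketch of the cleaned document dict do_document() reads, limited to
# the keys referenced above; values are hypothetical, and clean_document() may
# add more fields.
example_document = {
    "document_id": "ABC-123",
    "url": "https://example.gov/foia/ABC-123.pdf",
    "file_type": "pdf",
}
# do_document() writes path_for(page, "ABC-123", "json") first, then (unless
# dry_run is set) downloads the file and, if the download succeeds, runs
# utils.text_from_pdf() on it.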
def get_record(agency, year, doc_id, options):
    # meta_path = meta_path_for("record", agency, year, doc_id)
    json_path = data_path_for("record", agency, year, doc_id, "json")

    ## Special: resume mode
    #
    # Since downloading documents requires re-downloading landing pages,
    # the only way to do a resumable mass download of docs is to
    # check for whether the doc has been downloaded yet. This, in turn,
    # requires checking for the presence of [doc_type].[file_type], and the
    # file_type needs to be loaded from [doc_type].json.
    #
    # So, if --resume is on, before re-downloading anything, check if we
    # have a parsed .json, and if so, load the file_type and check for
    # the doc itself. If present, return True and move on.
    if options.get("resume"):
        if os.path.exists(json_path):
            data = json.load(open(json_path))

            # it's an unreleased doc, move on anyway
            if data["unreleased"]:
                logging.warn("[%s][%s][%s][%s] Unreleased, skipping." % ("record", agency, year, doc_id))
                return True

            doc_path = data_path_for("record", agency, year, doc_id, data["file_type"])
            if os.path.exists(doc_path):
                logging.warn("[%s][%s][%s][%s] Already done, skipping." % ("record", agency, year, doc_id))
                return True

    logging.warn("[%s][%s][%s][%s] Getting record..." % ("record", agency, year, doc_id))

    # meta = json.load(open(meta_path))

    # download landing page for record
    url = "https://foiaonline.regulations.gov/foia/action/public/view/record?objectId=%s" % doc_id

    # save the landing page no matter what, but only use it as a cache
    # if we're skipping the docs (download links are ephemeral :( )
    body = utils.download(
        url,
        cache_path_for("record", agency, year, doc_id),
        {'cache': options.get('skip_doc', False)}
    )

    # assume released
    unreleased = False

    doc = BeautifulSoup(body)

    main = doc.select("#mainForm")
    if main:
        main = main[0]
    else:
        logging.warn("[%s][%s][%s][%s] Landing page is not available, skipping." % ("record", agency, year, doc_id))
        return True

    # get some other metadata about the record
    headers = record_headers_from(doc)

    # now clear the labels so text can be more easily extracted
    for label in main.select("fieldset .formitem label"):
        label.extract()

    links = main.select("fieldset .formitem")

    # get the actual document download link/ID
    download_link = links[headers["title"]].select("a")
    if len(download_link) > 0:
        download_url = download_link[0]['href']
        download_url = "https://foiaonline.regulations.gov" + download_url
    # no link means it's not released
    else:
        unreleased = True

    title = links[headers["title"]].text.strip()

    author = links[headers["author"]].text.strip()
    if author == "N/A":
        author = None

    released_date = links[headers["released_on"]].text.strip()
    if released_date == "N/A":
        released_on = None
    else:
        try:
            released_at = parse(released_date)
            released_on = released_at.strftime("%Y-%m-%d")
        except TypeError:
            released_on = None

    request_id = links[headers["request"]].text.strip()

    file_type = links[headers["file_type"]].text.strip().lower()
    if file_type == "text":
        file_type = "txt"

    # for untyped binary files, just save .mystery and we'll worry later
    if (not file_type) or (file_type.strip() == ""):
        file_type = "mystery"

    # TODO: handle unexpected file types more gracefully.
    # Right now, it accepts any extension and downloads them.
    # It should choke on unexpected types, and email the admin.

    # this should correspond with it being unreleased
    if file_type.startswith("contact"):
        unreleased = True

    exemptions = links[headers["exemptions"]].text.strip()
    if exemptions == "N/A":
        exemptions = None

    retention = links[headers["retention"]].text.strip()
    if retention == "N/A":
        retention = None

    file_size = links[headers["file_size"]].text.strip()

    record = {
        "type": "record",
        "landing_id": doc_id,
        "landing_url": url,
        "agency": agency,
        "year": year,
        "request_id": request_id,
        "title": title,
        "released_on": released_on,
        "released_original": released_date,
        "author": author,
        "exemptions": exemptions,
        "retention": retention
    }

    if unreleased:
        record["unreleased"] = True
    else:
        record["unreleased"] = False
        record["file_size"] = file_size
        record["file_type"] = file_type
        # ephemeral, used below to download, and kept for record-keeping
        record["download_url"] = download_url

    # 1) write JSON to disk at predictable path
    utils.write(utils.json_for(record), json_path)

    # 2) download the associated record doc (unless dry run)
    if unreleased:
        logging.warn("\tUnreleased doc, moving on.")
    elif options.get('skip_doc') is None:
        logging.warn("\tDownloading...")

        text_types = ('txt',)
        binary_types = ('pdf', 'doc', 'docx', 'xls', 'xlsx', 'mystery', '')

        doc_path = data_path_for("record", agency, year, doc_id, record['file_type'])
        result = utils.download(
            download_url,
            doc_path,
            {
                'binary': True,
                'cache': not (options.get('force', False))
            }
        )

        # PDF extraction is easy enough
        if result and (record['file_type'] == 'pdf'):
            logging.warn("\tExtracting text from PDF...")
            utils.text_from_pdf(doc_path)

    return True
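# Usage sketch for get_record(); all argument values are hypothetical:
#
#   get_record("EPA", "2015", "090004d2806d5c09", {"resume": True})
#
# With resume on, a record whose JSON and document are already on disk is
# skipped. Otherwise the landing page is fetched, the parsed record JSON is
# written to data_path_for(...), and the released document (if any) is
# downloaded next to it.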
def paginated_mode(suffix, options, uid, api_key):
    # Cache hostnames in a dict for de-duping.
    hostnames_map = {}

    certificate_api = certificates.CensysCertificates(uid, api_key)

    if 'query' in options and options['query']:
        query = options['query']
    else:
        query = "parsed.subject.common_name:\"%s\" or parsed.extensions.subject_alt_name.dns_names:\"%s\"" % (suffix, suffix)
    logging.debug("Censys query:\n%s\n" % query)

    # time to sleep between requests (defaults to 5s)
    delay = int(options.get("delay", 5))

    # Censys page size, fixed
    page_size = 100

    # Start page defaults to 1.
    start_page = int(options.get("start", 1))

    # End page defaults to whatever the API says is the last one.
    end_page = options.get("end", None)
    if end_page is None:
        end_page = get_end_page(query, certificate_api)
        if end_page is None:
            logging.warn("Error looking up number of pages.")
            exit(1)
    else:
        end_page = int(end_page)

    max_records = ((end_page - start_page) + 1) * page_size

    fields = [
        "parsed.subject.common_name",
        "parsed.extensions.subject_alt_name.dns_names"
    ]

    current_page = start_page

    logging.warn("Fetching up to %i records, starting at page %i." % (max_records, start_page))

    last_cached = False
    force = options.get("force", False)

    while current_page <= end_page:
        if (not last_cached) and (current_page > start_page):
            logging.debug("(Waiting %is before fetching page %i.)" % (delay, current_page))
            last_cached = False
            time.sleep(delay)

        logging.debug("Fetching page %i." % current_page)

        cache_page = utils.cache_path(str(current_page), "censys")
        if (force is False) and (os.path.exists(cache_page)):
            logging.warn("\t[%i] Cached page." % current_page)
            last_cached = True

            certs_raw = open(cache_page).read()
            certs = json.loads(certs_raw)
            if (certs.__class__ is dict) and certs.get('invalid'):
                # Page was previously marked invalid: move on rather than retry it.
                current_page += 1
                continue
        else:
            try:
                certs = list(certificate_api.search(query, fields=fields, page=current_page, max_records=page_size))
                utils.write(utils.json_for(certs), cache_page)
            except censys.base.CensysException:
                logging.warn(utils.format_last_exception())
                logging.warn("Censys error, skipping page %i." % current_page)
                utils.write(utils.invalid({}), cache_page)
                current_page += 1
                continue
            except:
                logging.warn(utils.format_last_exception())
                logging.warn("Unexpected error, skipping page %i." % current_page)
                utils.write(utils.invalid({}), cache_page)
                exit(1)

        for cert in certs:
            # Common name + SANs
            names = cert.get('parsed.subject.common_name', []) + cert.get('parsed.extensions.subject_alt_name.dns_names', [])
            logging.debug(names)

            for name in names:
                hostnames_map[sanitize_name(name)] = None

        current_page += 1

    logging.debug("Done fetching from API.")

    return hostnames_map
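# Usage sketch for paginated_mode(); the uid/api_key values are hypothetical
# Censys credentials. The return value is a dict used purely as a de-duplicating
# set, so callers typically just take its keys.
#
#   hostnames_map = paginated_mode(".gov", {"start": 1, "delay": 5}, uid, api_key)
#   hostnames = sorted(hostnames_map.keys())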