import csv
import datetime
import json
import logging
import math
import os
import re
import time

import requests
from urllib import parse as urlparse

import censys.base
from censys import certificates, export

import utils  # local helper module used throughout this file


def paginated_mode(suffix, options, uid, api_key):
    # Cache hostnames in a dict for de-duping.
    hostnames_map = {}

    certificate_api = certificates.CensysCertificates(uid, api_key)

    if 'query' in options and options['query']:
        query = options['query']
    else:
        query = "parsed.subject.common_name:\"%s\" or parsed.extensions.subject_alt_name.dns_names:\"%s\"" % (suffix, suffix)
    logging.debug("Censys query:\n%s\n" % query)

    # Time to sleep between requests (defaults to 5s).
    delay = int(options.get("delay", 5))

    # Censys page size, fixed.
    page_size = 100

    # Start page defaults to 1.
    start_page = int(options.get("start", 1))

    # End page defaults to whatever the API says is the last one.
    end_page = options.get("end", None)
    if end_page is None:
        end_page = get_end_page(query, certificate_api)
        if end_page is None:
            logging.warn("Error looking up number of pages.")
            exit(1)
    else:
        end_page = int(end_page)

    max_records = ((end_page - start_page) + 1) * page_size

    fields = [
        "parsed.subject.common_name",
        "parsed.extensions.subject_alt_name.dns_names"
    ]

    current_page = start_page

    logging.warn("Fetching up to %i records, starting at page %i." % (max_records, start_page))

    last_cached = False
    force = options.get("force", False)

    while current_page <= end_page:
        # Wait between API requests, but don't wait after a cache hit.
        if (not last_cached) and (current_page > start_page):
            logging.debug("(Waiting %is before fetching page %i.)" % (delay, current_page))
            time.sleep(delay)
        last_cached = False

        logging.debug("Fetching page %i." % current_page)

        cache_page = utils.cache_path(str(current_page), "censys")
        if (force is False) and os.path.exists(cache_page):
            logging.warn("\t[%i] Cached page." % current_page)
            last_cached = True

            certs_raw = open(cache_page).read()
            certs = json.loads(certs_raw)
            if (certs.__class__ is dict) and certs.get('invalid'):
                # Page was cached as invalid: skip it, but still advance,
                # or this loop would never terminate.
                current_page += 1
                continue
        else:
            try:
                certs = list(certificate_api.search(query, fields=fields, page=current_page, max_records=page_size))
                utils.write(utils.json_for(certs), cache_page)
            except censys.base.CensysException:
                logging.warn(utils.format_last_exception())
                logging.warn("Censys error, skipping page %i." % current_page)
                utils.write(utils.invalid({}), cache_page)
                current_page += 1
                continue
            except Exception:
                logging.warn(utils.format_last_exception())
                logging.warn("Unexpected error, aborting on page %i." % current_page)
                utils.write(utils.invalid({}), cache_page)
                exit(1)

        for cert in certs:
            # Common name + SANs.
            names = cert.get('parsed.subject.common_name', []) + cert.get('parsed.extensions.subject_alt_name.dns_names', [])
            logging.debug(names)

            for name in names:
                hostnames_map[sanitize_name(name)] = None

        current_page += 1

    logging.debug("Done fetching from API.")

    return hostnames_map
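
# --- Illustrative sketches only ---
# sanitize_name and get_end_page are used above but defined elsewhere in the
# project. Minimal sketches of plausible implementations follow; treat the
# details (especially the metadata() call) as assumptions.

def sanitize_name(name):
    # Lowercase, trim, and strip any wildcard prefix, so that
    # "*.Example.GOV" de-dupes with "example.gov". (Assumed behavior.)
    return name.lower().strip().replace("*.", "")


def get_end_page(query, certificate_api):
    # Ask the API how many certificates match, then convert that count into
    # a last-page number at 100 results per page. Assumes the censys library
    # exposes a metadata(query) call returning a dict with a "count" field;
    # returns None on failure so the caller can bail out.
    try:
        metadata = certificate_api.metadata(query)
    except censys.base.CensysException:
        return None
    if metadata is None:
        return None
    return math.ceil(metadata.get("count", 0) / 100)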
def basic_check(endpoint):
    logging.debug("pinging %s..." % endpoint.url)

    # Test the endpoint. At first:
    #
    # * Don't follow redirects. (Will only follow if necessary.)
    #   If it's a 3XX, we'll ping again to follow redirects. This is
    #   necessary to reliably scope any errors (e.g. TLS errors) to
    #   the original endpoint.
    #
    # * Validate certificates. (Will figure out error if necessary.)
    try:
        req = ping(endpoint.url)

        endpoint.live = True
        if endpoint.protocol == "https":
            endpoint.https_valid = True

    except requests.exceptions.SSLError:
        # Retry with certificate validation disabled.
        try:
            req = ping(endpoint.url, verify=False)
        except requests.exceptions.SSLError:
            # If it's a protocol error or other, it's not live.
            endpoint.live = False
            logging.warn("Unexpected SSL protocol (or other) error during retry.")
            return
        except requests.exceptions.RequestException:
            endpoint.live = False
            logging.warn("Unexpected requests exception during retry. Printing error:")
            logging.warn(utils.format_last_exception())
            return

        # If it was a certificate error of any kind, it's live.
        endpoint.live = True

        # Figure out the error(s).
        https_check(endpoint)

    # And this is the parent of ConnectionError and other things.
    # For example, "too many redirects".
    # See https://github.com/kennethreitz/requests/blob/master/requests/exceptions.py
    except requests.exceptions.RequestException:
        endpoint.live = False
        logging.warn("Unexpected requests exception.")
        return

    # Endpoint is live: analyze the response.
    endpoint.headers = req.headers
    endpoint.status = req.status_code

    if str(endpoint.status).startswith('3'):
        endpoint.redirect = True

    if endpoint.redirect:
        location_header = req.headers.get('Location')

        # A 3XX response without a Location header isn't a redirect
        # we can follow or classify.
        if location_header is None:
            return

        # Absolute redirects (e.g. "https://example.com/Index.aspx").
        if location_header.startswith("http:") or location_header.startswith("https:"):
            immediate = location_header

        # Relative redirects (e.g. "Location: /Index.aspx").
        # Construct an absolute URI, relative to the original request.
        else:
            immediate = urlparse.urljoin(endpoint.url, location_header)

        # Chase down the ultimate destination, ignoring any certificate warnings.
        ultimate_req = None
        try:
            ultimate_req = ping(endpoint.url, allow_redirects=True, verify=False)
        except requests.exceptions.RequestException:
            # Swallow connection errors, but we won't be saving redirect info.
            pass

        # Now establish whether the redirects were:
        # * internal (same exact hostname),
        # * within the zone (any subdomain within the parent domain), or
        # * external (on some other parent domain).

        # The hostname of the endpoint (e.g. "www.agency.gov").
        subdomain_original = urlparse.urlparse(endpoint.url).hostname
        # The parent domain of the endpoint (e.g. "agency.gov").
        base_original = parent_domain_for(subdomain_original)

        # The hostname and parent domain of the immediate redirect.
        subdomain_immediate = urlparse.urlparse(immediate).hostname
        base_immediate = parent_domain_for(subdomain_immediate)

        endpoint.redirect_immediately_to = immediate
        endpoint.redirect_immediately_to_www = bool(re.match(r'^https?://www\.', immediate))
        endpoint.redirect_immediately_to_https = immediate.startswith("https://")
        endpoint.redirect_immediately_to_http = immediate.startswith("http://")
        endpoint.redirect_immediately_to_external = (base_original != base_immediate)
        endpoint.redirect_immediately_to_subdomain = (
            (base_original == base_immediate) and
            (subdomain_original != subdomain_immediate)
        )

        if ultimate_req is not None:
            # For the ultimate destination, use the URL we arrived at,
            # not the Location header. This auto-resolves relative redirects.
            eventual = ultimate_req.url

            # The hostname and parent domain of the eventual destination.
            subdomain_eventual = urlparse.urlparse(eventual).hostname
            base_eventual = parent_domain_for(subdomain_eventual)

            endpoint.redirect_eventually_to = eventual
            endpoint.redirect_eventually_to_https = eventual.startswith("https://")
            endpoint.redirect_eventually_to_http = eventual.startswith("http://")
            endpoint.redirect_eventually_to_external = (base_original != base_eventual)
            endpoint.redirect_eventually_to_subdomain = (
                (base_original == base_eventual) and
                (subdomain_original != subdomain_eventual)
            )
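
# --- Illustrative sketches only ---
# ping and parent_domain_for are used above but defined elsewhere in the
# project. Minimal sketches under assumptions follow.

PING_TIMEOUT = 10  # seconds; assumed default


def ping(url, allow_redirects=False, verify=True):
    # GET the URL. Redirects are not followed by default, so 3XX responses
    # (and any TLS errors) stay attributable to the original endpoint;
    # verify=False disables certificate validation for retries.
    return requests.get(
        url,
        allow_redirects=allow_redirects,
        verify=verify,
        timeout=PING_TIMEOUT
    )


def parent_domain_for(hostname):
    # Naive sketch: keep the last two labels ("www.agency.gov" -> "agency.gov").
    # The real helper almost certainly consults the Public Suffix List to
    # handle multi-label suffixes (e.g. ".co.uk") correctly.
    return ".".join(hostname.split(".")[-2:])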
def export_mode(suffix, options, uid, api_key):
    # Cache hostnames in a dict for de-duping.
    hostnames_map = {}

    # Default timeout: 20 minutes.
    timeout = int(options.get("timeout", 60 * 20))

    # Wait 5 seconds between checks on the job.
    between_jobs = 5

    try:
        export_api = export.CensysExport(uid, api_key)
    except censys.base.CensysUnauthorizedException:
        logging.warn("The Censys.io Export API rejected the provided Censys credentials. The credentials may be inaccurate, or you may need to request access from the Censys.io team.")
        exit(1)

    # Uses a FLATTEN command in order to work around a BigQuery
    # error around multiple "repeated" fields. *shrug*
    query = "SELECT parsed.subject.common_name, parsed.extensions.subject_alt_name.dns_names from FLATTEN([certificates.certificates], parsed.extensions.subject_alt_name.dns_names) where parsed.subject.common_name LIKE \"%%%s\" OR parsed.extensions.subject_alt_name.dns_names LIKE \"%%%s\";" % (suffix, suffix)
    logging.debug("Censys query:\n%s\n" % query)

    download_file = utils.cache_path("export", "censys", ext="csv")

    force = options.get("force", False)
    if (force is False) and os.path.exists(download_file):
        logging.warn("Using cached download data.")
    else:
        logging.warn("Kicking off SQL query job.")
        results_url = None

        try:
            job = export_api.new_job(query, format='csv', flatten=True)
            job_id = job['job_id']

            started = datetime.datetime.now()
            while True:
                elapsed = (datetime.datetime.now() - started).seconds
                status = export_api.check_job(job_id)

                if status['status'] == 'error':
                    logging.warn("Error from Censys: %s" % status['error'])
                    exit(1)

                # Not expected, but better to handle explicitly.
                elif status['status'] == 'expired':
                    logging.warn("Results are somehow expired, bailing.")
                    exit(1)

                elif status['status'] == 'pending':
                    logging.debug("[%is] Job still pending." % elapsed)
                    time.sleep(between_jobs)

                elif status['status'] == 'success':
                    logging.warn("[%is] Job complete!" % elapsed)
                    results_url = status['download_paths'][0]
                    break

                if elapsed > timeout:
                    logging.warn("Timeout waiting for job to complete.")
                    exit(1)

        except censys.base.CensysException:
            logging.warn(utils.format_last_exception())
            logging.warn("Censys error, aborting.")
            exit(1)

        # At this point, the job is complete, and results_url holds the
        # URL of the resulting CSV to download.
        logging.warn("Downloading results of SQL query.")
        utils.download(results_url, download_file)

    # Read in the downloaded CSV file, run the hostnames in each row
    # through the sanitizer, and de-dupe using the map.
    with open(download_file, newline='') as csvfile:
        for row in csv.reader(csvfile):
            if (not row[0]) or row[0].lower().startswith("parsed_subject_common_name"):
                continue

            names = [row[0].lower(), row[1].lower()]
            # logging.debug(names)

            for name in names:
                if name:
                    hostnames_map[sanitize_name(name)] = None

    return hostnames_map
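
# --- Illustrative sketch only ---
# A hypothetical driver showing how the two gathering modes above might be
# selected. The gather() name, the "export" option, and the environment
# variable names are assumptions for illustration, not part of the original
# module.

def gather(suffix, options):
    uid = os.environ.get("CENSYS_UID")          # assumed env var name
    api_key = os.environ.get("CENSYS_API_KEY")  # assumed env var name

    if (uid is None) or (api_key is None):
        logging.warn("Missing Censys credentials in environment.")
        exit(1)

    # export_mode runs one SQL job over the full dataset (and may require
    # Export API access); paginated_mode pages through the search API.
    if options.get("export", False):
        hostnames_map = export_mode(suffix, options, uid, api_key)
    else:
        hostnames_map = paginated_mode(suffix, options, uid, api_key)

    # Both modes use dict keys purely for de-duping; the sorted hostname
    # list is the useful output.
    return sorted(hostnames_map.keys())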