Example #1
def write_report(report):
  data_path = "%s/%s/%s/report.json" % (report['inspector'], report['year'], report['report_id'])
  utils.write(
    utils.json_for(report),
    "%s/%s" % (utils.data_dir(), data_path)
  )
  return data_path


def run(options):
  # Input: A path to the root of a domain-scan output directory.
  scan_path = options.get("scan", ".")
  results_path = os.path.join(scan_path, "results")

  # Output: Where to put the post-processed results.
  output_path = options.get("output", ".")


  # Take the uswds scan data and cut it down to just the rows where
  # the bad banner text or USWDS itself is present, keeping only the
  # data needed for each, in JSON form.

  uswds_csv = os.path.join(results_path, "uswds.csv")

  # collect lists of dicts to convert into JSON
  bad_banner = []
  uswds_present = []

  with open(uswds_csv, newline='') as csvfile:
    for dict_row in csv.DictReader(csvfile):

      has_bad_banner = utils.boolean_for(dict_row["USWDS Bad Banner Text"])
      is_uswds_present = utils.boolean_for(dict_row["USWDS Present"])

      if (has_bad_banner):
        bad_banner.append({
          'hostname': dict_row["Domain"],
          'base_domain': dict_row["Base Domain"],
          'scanned_url': dict_row["Scanned URL"]
        })

      if (is_uswds_present):
        uswds_present.append({
          'hostname': dict_row["Domain"],
          'base_domain': dict_row["Base Domain"],
          'scanned_url': dict_row["Scanned URL"]
        })

  # Save resulting JSON.
  bad_banner_data = utils.json_for(bad_banner)
  bad_banner_output = os.path.join(output_path, "bad_banner.json")
  utils.write(bad_banner_data, bad_banner_output)

  uswds_present_data = utils.json_for(uswds_present)
  uswds_present_output = os.path.join(output_path, "uswds_present.json")
  utils.write(uswds_present_data, uswds_present_output)
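
The post-processing above assumes a utils.boolean_for helper that coerces the scanner's "True"/"False" CSV cells into real booleans. A minimal stand-in, offered only as a sketch of the assumed behavior:

def boolean_for(value):
  # Treat common truthy CSV cell values as True; anything else is False.
  return str(value).strip().lower() in ("true", "yes", "1")

# e.g. boolean_for("True") -> True, boolean_for("False") -> False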
Example #3
def save_meta_result(result):
  path = meta_path_for(result['type'], result['agency'], result['year'], result['id'])

  # for paged metadata, don't overwrite if we've got it already,
  # we don't keep anything that should change.
  if os.path.exists(path):
    logging.debug("[%s][%s] Knew about it, skipping." % (result['id'], result['type']))
  else:
    logging.warn("[%s][%s] Newly discovered, saving metadata." % (result['id'], result['type']))
    utils.write(utils.json_for(result), path)
Example #4
def save_meta_result(result):
    path = meta_path_for(result['type'], result['agency'], result['year'],
                         result['id'])

    # for paged metadata, don't overwrite if we've got it already,
    # we don't keep anything that should change.
    if os.path.exists(path):
        logging.debug("[%s][%s] Knew about it, skipping." %
                      (result['id'], result['type']))
    else:
        logging.warn("[%s][%s] Newly discovered, saving metadata." %
                     (result['id'], result['type']))
        utils.write(utils.json_for(result), path)
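
Both versions of save_meta_result depend on a meta_path_for helper that maps (type, agency, year, id) to a predictable location on disk, much like write_report above. Its definition isn't shown; one plausible shape, purely as an illustration, is:

import os

def meta_path_for(doc_type, agency, year, doc_id, base_dir="data"):
    # Hypothetical layout: data/<type>/<agency>/<year>/<id>/meta.json
    return os.path.join(base_dir, doc_type, agency, str(year), str(doc_id),
                        "meta.json")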
Example #5
def main():
    args = docopt.docopt(__doc__, version='v0.0.1')
    utils.configure_logging(args['--debug'])

    out_file = args['--output']

    # Read from a .csv, or allow domains on the command line.
    domains = []
    if args['INPUT'][0].endswith(".csv"):
        domains = utils.load_domains(args['INPUT'][0])
    else:
        domains = args['INPUT']

    # If the user wants to sort them, sort them in place.
    if args['--sorted']:
        domains.sort()

    options = {
        'user_agent': args['--user-agent'],
        'timeout': args['--timeout'],
        'preload_cache': args['--preload-cache'],
        'cache': args['--cache']
    }
    results = pshtt.inspect_domains(domains, options)

    # JSON can go to STDOUT, or to a file.
    if args['--json']:
        output = utils.json_for(results)
        if out_file is None:
            print(output)
        else:
            utils.write(output, out_file)
            logging.warn("Wrote results to %s." % out_file)
    # Markdown can go to STDOUT, or to a file.
    elif args['--markdown']:
        output = sys.stdout
        if out_file is not None:
            output = open(out_file, 'w')

        pshtt.md_for(results, output)

        if out_file is not None:
            output.close()
    # CSV always goes to a file.
    else:
        if args['--output'] is None:
            out_file = 'results.csv'
        pshtt.csv_for(results, out_file)
        logging.warn("Wrote results to %s." % out_file)
Example #6
def scan(domain, options):
    logging.debug("[%s][pshtt]" % domain)

    # cache output from pshtt
    cache_pshtt = utils.cache_path(domain, "pshtt", ext="json")

    force = options.get("force", False)
    data = None

    if (force is False) and (os.path.exists(cache_pshtt)):
        logging.debug("\tCached.")
        raw = open(cache_pshtt).read()
        data = json.loads(raw)
        if isinstance(data, dict) and data.get('invalid'):
            return None

    else:
        logging.debug("\t %s %s" % (command, domain))

        raw = utils.scan([
            command, domain, '--json', '--user-agent',
            '\"%s\"' % user_agent, '--timeout',
            str(timeout), '--preload-cache', preload_cache
        ])

        if not raw:
            utils.write(utils.invalid({}), cache_pshtt)
            logging.warn("\tBad news scanning, sorry!")
            return None

        data = json.loads(raw)
        utils.write(utils.json_for(data), utils.cache_path(domain, "pshtt"))

    # pshtt scanner uses JSON arrays, even for single items
    data = data[0]

    row = []
    for field in headers:
        value = data[field]

        # TODO: Fix this upstream
        if (field != "HSTS Header") and (field != "HSTS Max Age") and (
                field != "Redirect To"):
            if value is None:
                value = False

        row.append(value)

    yield row
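
scan() is a generator: it yields at most one row per domain, and nothing when the cached result is marked invalid. A hedged sketch of how a caller might drain it into a CSV, reusing the module-level headers list the scanner itself reads from:

import csv

def write_rows(domains, options, out_path="pshtt-results.csv"):
    # Sketch only: assumes scan() and the headers list from the example above.
    with open(out_path, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(["Domain"] + headers)
        for domain in domains:
            for row in scan(domain, options):
                writer.writerow([domain] + row)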
Example #7
def create_preload_list():
    preload_json = None

    if PRELOAD_CACHE and os.path.exists(PRELOAD_CACHE):
        logging.debug("Using cached Chrome preload list.")
        preload_json = json.loads(open(PRELOAD_CACHE).read())
    else:
        logging.debug("Fetching Chrome preload list from source...")

        # Downloads the Chromium preloaded domain list as raw JSON.
        file_url = 'https://chromium.googlesource.com/chromium/src/net/+/master/http/transport_security_state_static.json?format=TEXT'

        # TODO: proper try/except around this network request
        request = requests.get(file_url)
        raw = request.content

        # To avoid parsing the contents of the file out of the source tree viewer's
        # HTML, we download it as a raw file. googlesource.com Base64-encodes the
        # file to avoid potential content injection issues, so we need to decode it
        # before using it. https://code.google.com/p/gitiles/issues/detail?id=7
        raw = base64.b64decode(raw).decode('utf-8')

        # The .json file contains '//' comments, which are not actually valid JSON,
        # and confuse Python's JSON decoder. Begone, foul comments!
        raw = ''.join(
            [re.sub(r'^\s*//.*$', '', line) for line in raw.splitlines()])

        preload_json = json.loads(raw)

        if PRELOAD_CACHE:
            logging.debug("Caching preload list at %s" % PRELOAD_CACHE)
            utils.write(utils.json_for(preload_json), PRELOAD_CACHE)

    # For our purposes, we only care about entries that includeSubDomains
    fully_preloaded = []
    for entry in preload_json['entries']:
        if entry.get('include_subdomains', False) is True:
            fully_preloaded.append(entry['name'])

    return fully_preloaded
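
The TODO above asks for proper error handling around the network request. One way to do that, offered as a suggestion rather than the project's actual fix, is to isolate the fetch so a network failure degrades to None instead of raising out of create_preload_list():

import logging
import requests

def fetch_preload_source(url, timeout=30):
    # Fetch the Base64-encoded preload list; return None on any network error.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except requests.RequestException:
        logging.warning("Could not fetch the Chrome preload list from source.")
        return None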
Example #8
def do_document(result, page, options):
  if result.get('pdfLink') is None:
    print("\tERROR, no pdfLink for document.")
    return False

  document = clean_document(result)

  # can limit to a particular known document ID, for debugging
  limit_id = options.get('document_id')
  if limit_id and (limit_id != document['document_id']):
    print("\tSkipping, not requested.")
    return False

  # 1) write JSON to disk at predictable path
  json_path = path_for(page, document['document_id'], "json")
  utils.write(utils.json_for(document), json_path)

  # 2) download pdfLink (unless dry run)
  if options.get('dry_run') is None:
    print("\t%s" % document['document_id'])

    pdf_path = path_for(page, document['document_id'], document['file_type'])

    result = utils.download(
      document['url'],
      pdf_path,
      {
        'binary': (document['file_type'].lower() == 'pdf'),
        'cache': not (options.get('force', False))
      }
    )

    if result:
      utils.text_from_pdf(pdf_path)


  return True
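
The document_id option makes it possible to re-run one known document while debugging, and a truthy dry_run keeps the JSON write but skips the download. A hedged invocation sketch, assuming result and page come from the surrounding listing-page loop (the ID is a placeholder):

# Only process one specific document on this page, and skip its PDF download.
do_document(result, page, {
  'document_id': 'EXAMPLE-DOC-ID',   # placeholder, not a real ID
  'dry_run': True,
})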
Example #9
def get_record(agency, year, doc_id, options):

  # meta_path = meta_path_for("record", agency, year, doc_id)
  json_path = data_path_for("record", agency, year, doc_id, "json")

  ## Special: resume mode
  #
  # since downloading documents requires re-downloading landing pages,
  # the only way to do a resumable mass download of docs is to
  # check for whether the doc has been downloaded yet. this, in turn,
  # requires checking for the presence of [doc_type].[file_type], and the
  # file_type needs to be loaded from [doc_type].json.
  #
  # So, if --resume is on, before re-downloading anything, check if we
  # have a parsed .json, and if so, load the file_type and check for
  # the doc itself. If present, return True and move on.
  if options.get("resume"):
    if os.path.exists(json_path):
      data = json.load(open(json_path))

      # it's an unreleased doc, move on anyway
      if data["unreleased"]:
        logging.warn("[%s][%s][%s][%s] Unreleased, skipping." % ("record", agency, year, doc_id))
        return True

      doc_path = data_path_for("record", agency, year, doc_id, data["file_type"])
      if os.path.exists(doc_path):
        logging.warn("[%s][%s][%s][%s] Already done, skipping." % ("record", agency, year, doc_id))
        return True

  logging.warn("[%s][%s][%s][%s] Getting record..." % ("record", agency, year, doc_id))
  # meta = json.load(open(meta_path))

  # download landing page for record
  url = "https://foiaonline.regulations.gov/foia/action/public/view/record?objectId=%s" % doc_id

  # save the landing page no matter what, but only use it as a cache
  # if we're skipping the docs (download links are ephemeral :( )
  body = utils.download(url,
    cache_path_for("record", agency, year, doc_id),
    {'cache': options.get('skip_doc', False)}
  )

  # assume released
  unreleased = False

  doc = BeautifulSoup(body)
  main = doc.select("#mainForm")
  if main:
    main = main[0]
  else:
    logging.warn("[%s][%s][%s][%s] Landing page is not available, skipping." % ("record", agency, year, doc_id))
    return True

  # get some other metadata about the record
  headers = record_headers_from(doc)

  # now clear the labels so text can be more easily extracted
  for label in main.select("fieldset .formitem label"):
    label.extract()

  links = main.select("fieldset .formitem")

  # get the actual document download link/ID
  download_link = links[headers["title"]].select("a")
  if len(download_link) > 0:
    download_url = download_link[0]['href']
    download_url = "https://foiaonline.regulations.gov" + download_url

  # no link means it's not released
  else:
    unreleased = True

  title = links[headers["title"]].text.strip()
  author = links[headers["author"]].text.strip()
  if author == "N/A": author = None

  released_date = links[headers["released_on"]].text.strip()
  if released_date == "N/A":
    released_on = None
  else:
    try:
      released_at = parse(released_date)
      released_on = released_at.strftime("%Y-%m-%d")
    except TypeError:
      released_on = None

  request_id = links[headers["request"]].text.strip()

  file_type = links[headers["file_type"]].text.strip().lower()
  if file_type == "text": file_type = "txt"

  # for untyped binary files, just save .mystery and we'll worry later
  if (not file_type) or (file_type.strip() == ""):
    file_type = "mystery"

  # TODO: handle unexpected file types more gracefully
  # right now, it accepts any extension and dl's them.
  # it should choke on unexpected types, and email admin.

  # this should correspond with it being unreleased
  if file_type.startswith("contact"):
    unreleased = True

  exemptions = links[headers["exemptions"]].text.strip()
  if exemptions == "N/A": exemptions = None
  retention = links[headers["retention"]].text.strip()
  if retention == "N/A": retention = None

  file_size = links[headers["file_size"]].text.strip()

  record = {
    "type": "record",
    "landing_id": doc_id,
    "landing_url": url,
    "agency": agency,
    "year": year,

    "request_id": request_id,
    "title": title,
    "released_on": released_on,
    "released_original": released_date,
    "author": author,
    "exemptions": exemptions,
    "retention": retention
  }

  if unreleased:
    record["unreleased"] = True
  else:
    record["unreleased"] = False
    record["file_size"] = file_size
    record["file_type"] = file_type
    # ephemeral, used below to download, and kept for record-keeping
    record["download_url"] = download_url

  # 1) write JSON to disk at predictable path
  utils.write(utils.json_for(record), json_path)

  # 2) download the associated record doc (unless dry run)
  if unreleased:
    logging.warn("\tUnreleased doc, moving on.")
  elif options.get('skip_doc') is None:
    logging.warn("\tDownloading...")

    text_types = ('txt',)
    binary_types = ('pdf', 'doc', 'docx', 'xls', 'xlsx', 'mystery', '')
    doc_path = data_path_for("record", agency, year, doc_id, record['file_type'])

    result = utils.download(
      download_url,
      doc_path,
      {
        'binary': True,
        'cache': not (options.get('force', False))
      }
    )

    # PDF extraction is easy enough
    if result and (record['file_type'] == 'pdf'):
      logging.warn("\tExtracting text from PDF...")
      utils.text_from_pdf(doc_path)


  return True
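
get_record leans on a data_path_for helper to build every path it reads and writes. Its definition isn't shown; a hypothetical version consistent with the resume-mode comment (the parsed metadata lives at [doc_type].json next to the downloaded [doc_type].[file_type]) could be:

import os

def data_path_for(doc_type, agency, year, doc_id, ext, base_dir="data"):
  # Hypothetical layout: data/record/<agency>/<year>/<id>/record.json, record.pdf, ...
  return os.path.join(base_dir, doc_type, agency, str(year), str(doc_id),
                      "%s.%s" % (doc_type, ext))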
Example #10
def paginated_mode(suffix, options, uid, api_key):
    # Cache hostnames in a dict for de-duping.
    hostnames_map = {}

    certificate_api = certificates.CensysCertificates(uid, api_key)

    if 'query' in options and options['query']:
        query = options['query']
    else:
        query = "parsed.subject.common_name:\"%s\" or parsed.extensions.subject_alt_name.dns_names:\"%s\"" % (
            suffix, suffix)
    logging.debug("Censys query:\n%s\n" % query)

    # time to sleep between requests (defaults to 5s)
    delay = int(options.get("delay", 5))

    # Censys page size, fixed
    page_size = 100

    # Start page defaults to 1.
    start_page = int(options.get("start", 1))

    # End page defaults to whatever the API says is the last one.
    end_page = options.get("end", None)
    if end_page is None:
        end_page = get_end_page(query, certificate_api)
        if end_page is None:
            logging.warn("Error looking up number of pages.")
            exit(1)
    else:
        end_page = int(end_page)

    max_records = ((end_page - start_page) + 1) * page_size

    fields = [
        "parsed.subject.common_name",
        "parsed.extensions.subject_alt_name.dns_names"
    ]

    current_page = start_page

    logging.warn("Fetching up to %i records, starting at page %i." %
                 (max_records, start_page))
    last_cached = False
    force = options.get("force", False)

    while current_page <= end_page:
        if (not last_cached) and (current_page > start_page):
            logging.debug("(Waiting %is before fetching page %i.)" %
                          (delay, current_page))
            time.sleep(delay)
        # Reset each pass; the cached branch below flips this back to True,
        # so only pages that follow a cache hit skip the delay.
        last_cached = False

        logging.debug("Fetching page %i." % current_page)

        cache_page = utils.cache_path(str(current_page), "censys")
        if (force is False) and (os.path.exists(cache_page)):
            logging.warn("\t[%i] Cached page." % current_page)
            last_cached = True

            certs_raw = open(cache_page).read()
            certs = json.loads(certs_raw)
            if isinstance(certs, dict) and certs.get('invalid'):
                # Advance past pages previously marked invalid instead of
                # looping on the same cached page forever.
                current_page += 1
                continue
        else:
            try:
                certs = list(
                    certificate_api.search(query,
                                           fields=fields,
                                           page=current_page,
                                           max_records=page_size))
                utils.write(utils.json_for(certs), cache_page)
            except censys.base.CensysException:
                logging.warn(utils.format_last_exception())
                logging.warn("Censys error, skipping page %i." % current_page)
                utils.write(utils.invalid({}), cache_page)
                current_page += 1
                continue
            except Exception:
                logging.warn(utils.format_last_exception())
                logging.warn("Unexpected error, skipping page %i." %
                             current_page)
                utils.write(utils.invalid({}), cache_page)
                exit(1)

        for cert in certs:
            # Common name + SANs
            names = cert.get('parsed.subject.common_name', []) + cert.get(
                'parsed.extensions.subject_alt_name.dns_names', [])
            logging.debug(names)

            for name in names:
                hostnames_map[sanitize_name(name)] = None

        current_page += 1

    logging.debug("Done fetching from API.")

    return hostnames_map
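
paginated_mode returns the de-duplicated hostnames as the keys of a dict. A hedged invocation sketch, with placeholder credentials and a small page range:

hostnames_map = paginated_mode(
    ".example.gov",                      # placeholder suffix to search for
    {"start": 1, "end": 2, "delay": 5},  # fetch two pages, pausing 5s between
    uid="CENSYS_API_ID",                 # placeholder credentials
    api_key="CENSYS_API_SECRET",
)
print("%i unique hostnames gathered" % len(hostnames_map))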
Example #11
def get_record(agency, year, doc_id, options):

    # meta_path = meta_path_for("record", agency, year, doc_id)
    json_path = data_path_for("record", agency, year, doc_id, "json")

    ## Special: resume mode
    #
    # since downloading documents requires re-downloading landing pages,
    # the only way to do a resumable mass download of docs is to
    # check for whether the doc has been downloaded yet. this, in turn,
    # requires checking for the presence of [doc_type].[file_type], and the
    # file_type needs to be loaded from [doc_type].json.
    #
    # So, if --resume is on, before re-downloading anything, check if we
    # have a parsed .json, and if so, load the file_type and check for
    # the doc itself. If present, return True and move on.
    if options.get("resume"):
        if os.path.exists(json_path):
            data = json.load(open(json_path))

            # it's an unreleased doc, move on anyway
            if data["unreleased"]:
                logging.warn("[%s][%s][%s][%s] Unreleased, skipping." %
                             ("record", agency, year, doc_id))
                return True

            doc_path = data_path_for("record", agency, year, doc_id,
                                     data["file_type"])
            if os.path.exists(doc_path):
                logging.warn("[%s][%s][%s][%s] Already done, skipping." %
                             ("record", agency, year, doc_id))
                return True

    logging.warn("[%s][%s][%s][%s] Getting record..." %
                 ("record", agency, year, doc_id))
    # meta = json.load(open(meta_path))

    # download landing page for record
    url = "https://foiaonline.regulations.gov/foia/action/public/view/record?objectId=%s" % doc_id

    # save the landing page no matter what, but only use it as a cache
    # if we're skipping the docs (download links are ephemeral :( )
    body = utils.download(url, cache_path_for("record", agency, year, doc_id),
                          {'cache': options.get('skip_doc', False)})

    # assume released
    unreleased = False

    doc = BeautifulSoup(body)
    main = doc.select("#mainForm")
    if main:
        main = main[0]
    else:
        logging.warn(
            "[%s][%s][%s][%s] Landing page is not available, skipping." %
            ("record", agency, year, doc_id))
        return True

    # get some other metadata about the record
    headers = record_headers_from(doc)

    # now clear the labels so text can be more easily extracted
    for label in main.select("fieldset .formitem label"):
        label.extract()

    links = main.select("fieldset .formitem")

    # get the actual document download link/ID
    download_link = links[headers["title"]].select("a")
    if len(download_link) > 0:
        download_url = download_link[0]['href']
        download_url = "https://foiaonline.regulations.gov" + download_url

    # no link means it's not released
    else:
        unreleased = True

    title = links[headers["title"]].text.strip()
    author = links[headers["author"]].text.strip()
    if author == "N/A": author = None

    released_date = links[headers["released_on"]].text.strip()
    if released_date == "N/A":
        released_on = None
    else:
        try:
            released_at = parse(released_date)
            released_on = released_at.strftime("%Y-%m-%d")
        except TypeError:
            released_on = None

    request_id = links[headers["request"]].text.strip()

    file_type = links[headers["file_type"]].text.strip().lower()
    if file_type == "text": file_type = "txt"

    # for untyped binary files, just save .mystery and we'll worry later
    if (not file_type) or (file_type.strip() == ""):
        file_type = "mystery"

    # TODO: handle unexpected file types more gracefully
    # right now, it accepts any extension and dl's them.
    # it should choke on unexpected types, and email admin.

    # this should correspond with it being unreleased
    if file_type.startswith("contact"):
        unreleased = True

    exemptions = links[headers["exemptions"]].text.strip()
    if exemptions == "N/A": exemptions = None
    retention = links[headers["retention"]].text.strip()
    if retention == "N/A": retention = None

    file_size = links[headers["file_size"]].text.strip()

    record = {
        "type": "record",
        "landing_id": doc_id,
        "landing_url": url,
        "agency": agency,
        "year": year,
        "request_id": request_id,
        "title": title,
        "released_on": released_on,
        "released_original": released_date,
        "author": author,
        "exemptions": exemptions,
        "retention": retention
    }

    if unreleased:
        record["unreleased"] = True
    else:
        record["unreleased"] = False
        record["file_size"] = file_size
        record["file_type"] = file_type
        # ephemeral, used below to download, and kept for record-keeping
        record["download_url"] = download_url

    # 1) write JSON to disk at predictable path
    utils.write(utils.json_for(record), json_path)

    # 2) download the associated record doc (unless dry run)
    if unreleased:
        logging.warn("\tUnreleased doc, moving on.")
    elif options.get('skip_doc') is None:
        logging.warn("\tDownloading...")

        text_types = ('txt',)
        binary_types = ('pdf', 'doc', 'docx', 'xls', 'xlsx', 'mystery', '')
        doc_path = data_path_for("record", agency, year, doc_id,
                                 record['file_type'])

        result = utils.download(download_url, doc_path, {
            'binary': True,
            'cache': not (options.get('force', False))
        })

        # PDF extraction is easy enough
        if result and (record['file_type'] == 'pdf'):
            logging.warn("\tExtracting text from PDF...")
            utils.text_from_pdf(doc_path)

    return True