Example #1
def init(options):
    """
    Download the Chrome preload list at the beginning of the scan, and
    re-use it for each scan. It is unnecessary to re-download the list for each
    scan because it changes infrequently.
    """
    global exclude_list
    global parents_list
    exclude_path = options.get("subdomains-exclude", None)
    parents_path = options.get("subdomains-parents", None)

    if (exclude_path is None) or (parents_path is None):
        logging.warn(
            "Specify CSVs with --subdomains-exclude and --subdomains-parents.")
        return False

    # list of subdomains to manually exclude
    exclude_list = utils.load_domains(exclude_path)

    # make a map of {'domain.gov': 'name of owner'}
    parents_list = utils.load_domains(parents_path, whole_rows=True)
    for domain_info in parents_list:
        domain_map[domain_info[0]] = domain_info[2]

    return True
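
A minimal usage sketch for the init() hook above, assuming it runs in the same module (so exclude_list and domain_map are in scope, with domain_map a module-level dict defined elsewhere); the CSV paths below are hypothetical.

options = {
    "subdomains-exclude": "data/exclude.csv",   # hypothetical path
    "subdomains-parents": "data/parents.csv",   # hypothetical path
}

if init(options):
    print("Excluding %d subdomains." % len(exclude_list))
    print("Mapped %d domains to owners." % len(domain_map))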
Example #2
def gather(suffix, options):
    url = options.get("url")
    if url is None:
        logging.warn("A --url is required. (Can be a local path.)")
        exit(1)

    # remote URL
    if url.startswith("http:") or url.startswith("https:"):
        # Though it's saved in cache/, it will be downloaded every time.
        remote_path = os.path.join(utils.cache_dir(), "url.csv")

        try:
            response = requests.get(url)
            utils.write(response.text, remote_path)
        except:
            logging.error("Remote URL not downloaded successfully.")
            print(utils.format_last_exception())
            exit(1)

    # local path
    else:
        remote_path = url

    for domain in utils.load_domains(remote_path):
        yield domain
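
gather() above is a generator, so a caller simply iterates it. A brief sketch, assuming a local CSV path; the suffix argument is unused by this particular gatherer and the values below are placeholders.

options = {"url": "cache/domains.csv"}   # hypothetical local path

for domain in gather(".gov", options):
    print(domain)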
Example #3
def init(options):
    global analytics_domains

    analytics_file = options.get("analytics")
    if (not analytics_file) or (not analytics_file.endswith(".csv")):
        no_csv = "--analytics should point to the file path or URL to a CSV of participating domains."
        logging.error(no_csv)
        return False

    # It's a URL, download it first.
    if analytics_file.startswith("http:") or analytics_file.startswith("https:"):

        analytics_path = os.path.join(utils.cache_dir(), "analytics.csv")

        try:
            response = requests.get(analytics_file)
            utils.write(response.text, analytics_path)
        except:
            no_csv = "--analytics URL not downloaded successfully."
            logging.error(no_csv)
            return False

    # Otherwise, read it off the disk
    else:
        analytics_path = analytics_file

        if (not os.path.exists(analytics_path)):
            no_csv = "--analytics file not found."
            logging.error(no_csv)
            return False

    analytics_domains = utils.load_domains(analytics_path)

    return True
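
Once init() succeeds, analytics_domains is just a list of hostnames. A hedged sketch of a later membership check; the helper name and example domain are hypothetical.

def participates(domain):
    # Assumes utils.load_domains() returns lowercased hostnames.
    return domain.lower() in analytics_domains

print(participates("example.gov"))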
Example #4
def init(options):
    global exclude_list
    global parents_list
    exclude_path = options.get("subdomains-exclude", None)
    parents_path = options.get("subdomains-parents", None)

    if (exclude_path is None) or (parents_path is None):
        logging.warn("Specify CSVs with --subdomains-exclude and --subdomains-parents.")
        return False

    # list of subdomains to manually exclude
    exclude_list = utils.load_domains(exclude_path)

    # make a map of {'domain.gov': 'name of owner'}
    parents_list = utils.load_domains(parents_path, whole_rows=True)
    for domain_info in parents_list:
        domain_map[domain_info[0]] = domain_info[2]

    return True
Example #5
def init(options):
    global exclude_list
    global parents_list
    exclude_path = options.get("subdomains-exclude", None)
    parents_path = options.get("subdomains-parents", None)

    if (exclude_path is None) or (parents_path is None):
        logging.warn(
            "Specify CSVs with --subdomains-exclude and --subdomains-parents.")
        return False

    # list of subdomains to manually exclude
    exclude_list = utils.load_domains(exclude_path)

    # make a map of {'domain.gov': 'name of owner'}
    parents_list = utils.load_domains(parents_path, whole_rows=True)
    for domain_info in parents_list:
        domain_map[domain_info[0]] = domain_info[2]

    return True
Example #6
def init(options):
    global analytics_domains

    analytics_file = options.get("analytics")
    if ((not analytics_file) or
            (not analytics_file.endswith(".csv")) or
            (not os.path.exists(analytics_file))):
        no_csv = "--analytics should point to a CSV of participating domains."
        logging.error(no_csv)
        return False

    analytics_domains = utils.load_domains(analytics_file)

    return True
Example #7
def init(options):
    """
    Download the Chrome preload list at the beginning of the scan, and
    re-use it for each scan. It is unnecessary to re-download the list for each
    scan because it changes infrequently.
    """
    global exclude_list
    global parents_list
    exclude_path = options.get("subdomains-exclude", None)
    parents_path = options.get("subdomains-parents", None)

    if (exclude_path is None) or (parents_path is None):
        logging.warn("Specify CSVs with --subdomains-exclude and --subdomains-parents.")
        return False

    # list of subdomains to manually exclude
    exclude_list = utils.load_domains(exclude_path)

    # make a map of {'domain.gov': 'name of owner'}
    parents_list = utils.load_domains(parents_path, whole_rows=True)
    for domain_info in parents_list:
        domain_map[domain_info[0]] = domain_info[2]

    return True
Example #8
def gather(suffixes, options, extra={}):

    # Returns a parsed, processed Google service credentials object.
    credentials = load_credentials()

    if credentials is None:
        logging.warn("No BigQuery credentials provided.")
        logging.warn(
            "Set BIGQUERY_CREDENTIALS or BIGQUERY_CREDENTIALS_PATH environment variables."
        )
        exit(1)

    # When using this form of instantiation, the client won't pull the
    # project_id out of the credentials, so it has to be set explicitly.
    client = bigquery.Client(project=credentials.project_id,
                             credentials=credentials)

    # Allow override of default timeout (in seconds).
    timeout = int(options.get("timeout", default_timeout))

    # Construct the query.
    query = query_for(suffixes)
    logging.debug("Censys query:\n%s\n" % query)

    # Plan to store in cache/censys/export.csv.
    download_path = utils.cache_path("export", "censys", ext="csv")

    # Reuse of cached data can be turned on with --cache.
    cache = options.get("cache", False)
    if (cache is True) and os.path.exists(download_path):
        logging.warn("Using cached download data.")

    # But by default, fetch new data from the BigQuery API,
    # and write it to the expected download location.
    else:
        logging.warn("Kicking off SQL query job.")

        rows = None

        # Actually execute the query.
        try:
            # Executes query and loads all results into memory.
            query_job = client.query(query)
            iterator = query_job.result(timeout=timeout)
            rows = list(iterator)
        except google.api_core.exceptions.Forbidden:
            logging.warn("Access denied to Censys' BigQuery tables.")
            exit(1)
        except:
            logging.warn(utils.format_last_exception())
            logging.warn("Error talking to BigQuery, aborting.")
            exit(1)

        # At this point, the query is complete and the rows are in memory,
        # so write them out to the expected download location.
        logging.warn("Caching results of SQL query.")

        download_file = open(download_path, 'w', newline='')
        download_writer = csv.writer(download_file)
        download_writer.writerow(["Domain"])  # will be skipped on read

        # Parse the rows and write them out as they were returned (dupes
        # and all), to be de-duped by the central gathering script.
        for row in rows:
            domains = row['common_name'] + row['dns_names']
            for domain in domains:
                download_writer.writerow([domain])

        # End CSV writing.
        download_file.close()

    # Whether we downloaded it fresh or not, read from the cached data.
    for domain in utils.load_domains(download_path):
        if domain:
            yield domain
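
A sketch of driving this BigQuery-backed gatherer, assuming credentials are already available via BIGQUERY_CREDENTIALS or BIGQUERY_CREDENTIALS_PATH; the suffixes, timeout, and de-duplication below are illustrative, since the snippet notes that de-duping is left to the central gathering script.

options = {"timeout": 60 * 30, "cache": False}   # hypothetical values

seen = set()
for domain in gather([".gov", ".fed.us"], options):
    if domain not in seen:
        seen.add(domain)
        print(domain)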