Example #1
def update(
        ctx: click.core.Context,
        owners: typing.IO[str],
        domains: typing.IO[str],
        ciphers: typing.IO[str]
) -> None:
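    """Update the tracker's owner, domain, and cipher data from the supplied input files."""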

    with models.Connection(ctx.obj.get("connection_string")) as connection:
        update_data(owners, domains, ciphers, connection)
        LOGGER.info("'tracker update' completed.")
Example #2
def insert(
    ctx: click.core.Context,
    owners: typing.IO[str],
    domains: typing.IO[str],
    ciphers: typing.IO[str],
    upsert: bool,
) -> None:
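    """Insert the owner, domain, and cipher records into the database, upserting if requested."""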

    with models.Connection(ctx.obj.get("connection_string")) as connection:
        insert_data(owners, domains, ciphers, upsert, connection)
Example #3
def connection(request: _pytest.fixtures.SubRequest) -> typing.Iterator[models.Connection]:
    if not local_mongo_is_running():
        pytest.skip('Local MongoDB instance is not running.')

    connection_string = request.param
    with models.Connection(connection_string) as connection:
        yield connection

    with pymongo.MongoClient(connection_string) as client:
        try:
            client.drop_database(client.get_database())
        except pymongo.errors.ConfigurationError:
            client.drop_database('track')
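
The fixture above takes its connection string from request.param, so tests supply it through indirect parametrization (its @pytest.fixture decorator is not shown in the snippet). A minimal usage sketch with a hypothetical test and an assumed local connection string:

import pytest

@pytest.mark.parametrize(
    "connection",
    ["mongodb://localhost:27017/track"],  # assumed local connection string
    indirect=True,
)
def test_domains_start_empty(connection):
    # The fixture yields an open models.Connection and drops the test database
    # once the test finishes; this assumes the database starts out empty.
    assert list(connection.domains.all()) == []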
Example #4
def insert(
    ctx: click.core.Context,
    owners: typing.IO[str],
    domains: typing.IO[str],
    ciphers: typing.IO[str],
) -> None:
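    """Bulk-create owner, input-domain, and cipher documents from the supplied CSV files."""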

    owners_reader = csv.DictReader(owners)
    domains_reader = csv.DictReader(domains)
    ciphers_reader = csv.DictReader(ciphers)

    with models.Connection(ctx.obj.get('connection_string')) as connection:
        connection.owners.create_all(document for document in owners_reader)
        connection.input_domains.create_all(document
                                            for document in domains_reader)
        connection.ciphers.create_all(document for document in ciphers_reader)
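
The create_all calls consume the dictionaries that csv.DictReader yields, one per data row and keyed by the header line. A quick illustration with io.StringIO standing in for the owners file (the column names here are hypothetical):

import csv
import io

owners = io.StringIO("domain,organization_en\ncanada.ca,Treasury Board Secretariat\n")
print(list(csv.DictReader(owners)))
# [{'domain': 'canada.ca', 'organization_en': 'Treasury Board Secretariat'}]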
Example #5
def run(date: typing.Optional[str],
        connection_string: str,
        batch_size: typing.Optional[int] = None) -> None:
    if date is None:
        date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

    # Read in domains and organizations from domains.csv.
    # Returns dicts of values ready for saving as Domain and Agency objects.
    #
    # Also returns gathered subdomains, which need more filtering to be useful.
    domains, owners = load_domain_data()
    results = {}
    acceptable_ciphers = load_compliance_data()

    # Read in domain-scan CSV data.
    scan_data = load_scan_data(domains)

    # Capture manual exclusions and pull out some high-level data from pshtt.
    for domain_name in scan_data:
        # Pull out a few pshtt.csv fields as general domain-level metadata.
        domain_data = scan_data[domain_name]

        pshtt = domain_data.get("pshtt", None)
        if pshtt is None:
            # generally means scan was on different domains.csv, but
            # invalid domains can hit this.
            LOGGER.warning("[%s] No pshtt data for domain!", domain_name)
        elif boolean_for(pshtt['Live']):
            if boolean_for(pshtt['Redirect']):
                redirection = urlparse(pshtt["Redirect To"]).netloc
                if redirection not in domains:
                    LOGGER.warning(
                        "%s redirected to %s which is not in the domains list",
                        domain_name, redirection)

            results[domain_name] = {
                "domain": domain_name,
                "is_owner": domain_name in owners,
                "is_parent": domain_name in owners,
                "sources": ["canada-gov"],
                "live": True,
                "redirect": boolean_for(pshtt["Redirect"]),
                "canonical": pshtt["Canonical URL"],
                "exclude": {},
            }

    # Find the parent domain for all domains in the owner list, mutating results in place
    map_subdomains(results, owners)

    # Extract organizations actually used in the set of scanned domains, and their counts
    organizations = extract_orgs(results)

    sorted_domains = sorted(results)
    sorted_organizations = sorted(organizations)

    # Calculate high-level per-domain conclusions for each report.
    # Overwrites `results` in place
    process_https(results, scan_data, acceptable_ciphers)
    # Totals scan data for parent domains
    total_reports(
        results,
        owners,
    )

    # Calculate organization-level summaries. Updates `organizations` in-place.
    update_organization_totals(organizations, results)

    # Calculate government-wide summaries.
    report = full_report(results)
    report["report_date"] = date

    # Reset the database.
    with models.Connection(connection_string) as connection:
        LOGGER.info("Clearing the domains.")
        connection.domains.clear(batch_size=batch_size)
        LOGGER.info("Creating all domains.")
        connection.domains.create_all(
            (results[domain_name] for domain_name in sorted_domains),
            batch_size=batch_size)

        LOGGER.info("Clearing organizations.")
        connection.organizations.clear(batch_size=batch_size)
        LOGGER.info("Creating all organizations.")
        connection.organizations.create_all(
            (organizations[organization_name]
             for organization_name in sorted_organizations),
            batch_size=batch_size)

        LOGGER.info("Replacing government-wide totals.")
        connection.reports.replace({}, report)

        LOGGER.info("Signal track-web to drop cache")
        connection.flags.replace({}, {"cache": False})

    # Print and exit
    print_report(report)
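
boolean_for is used throughout run to coerce pshtt CSV fields such as "Live" and "Redirect" into booleans. Its definition is not included in these examples; a minimal sketch, assuming those columns contain the literal strings "True" and "False":

def boolean_for(value: str) -> bool:
    # Assumption: pshtt CSV cells hold the strings "True" or "False".
    return value == "True"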
Example #6
def preprocess(ctx: click.core.Context, output: typing.Optional[str]) -> None:
    if not output:
        output = os.path.join(os.getcwd(), "csv")

    with models.Connection(ctx.obj.get("connection_string")) as connection:
        pull_data(output, connection)
Example #7
def run(date: typing.Optional[str],
        connection_string: str,
        batch_size: typing.Optional[int] = None) -> None:
    if date is None:
        date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

    # Read in domains and organizations from domains.csv.
    # Returns dicts of values ready for saving as Domain and Agency objects.
    #
    # Also returns gathered subdomains, which need more filtering to be useful.
    domains, owners = load_domain_data()
    results = {}
    acceptable_ciphers = load_compliance_data()

    # Read in domain-scan CSV data.
    scan_data = load_scan_data(domains)

    # Capture manual exclusions and pull out some high-level data from pshtt.
    for domain_name in scan_data:
        # Pull out a few pshtt.csv fields as general domain-level metadata.
        domain_data = scan_data[domain_name]

        pshtt = domain_data.get("pshtt", None)
        if pshtt is None:
            # generally means scan was on different domains.csv, but
            # invalid domains can hit this.
            LOGGER.warning("[%s] No pshtt data for domain!", domain_name)
        elif boolean_for(pshtt['Live']):
            if boolean_for(pshtt['Redirect']):
                redirection = urlparse(pshtt["Redirect To"]).netloc
                if redirection not in domains:
                    LOGGER.warning(
                        "%s redirected to %s which is not in the domains list",
                        domain_name, redirection)

            results[domain_name] = {
                "domain": domain_name,
                "is_owner": domain_name in owners,
                "is_parent": domain_name in owners,
                "sources": ["canada-gov"],
                "live": True,
                "redirect": boolean_for(pshtt["Redirect"]),
                "redirect_url": urlparse(pshtt["Redirect To"]).geturl(),
                "canonical": pshtt["Canonical URL"],
                "exclude": {},
            }

    # Find the parent domain for all domains in the owner list, mutating results in place
    map_subdomains(results, owners)

    # Extract organizations actually used in the set of scanned domains, and their counts
    organizations = extract_orgs(results)

    sorted_domains = sorted(results)
    sorted_organizations = sorted(organizations)

    # Calculate high-level per-domain conclusions for each report.
    # Overwrites `results` in place
    process_https(results, scan_data, acceptable_ciphers)
    # Totals scan data for parent domains
    total_reports(
        results,
        owners,
    )

    # Calculate organization-level summaries. Updates `organizations` in-place.
    update_organization_totals(organizations, results)

    # Calculate government-wide summaries.
    report = full_report(results)
    report["report_date"] = date

    # Reset the database.
    with models.Connection(connection_string) as connection:
        LOGGER.info("Updating or creating all domains.")

        # get remote list of domains
        remote_in_domains = [
            document['domain'] for document in connection.domains.all()
        ]

        # use set logic to find the set of domains that need to be removed
        id_removals = set(remote_in_domains) - set(sorted_domains)

        # add scan date in all domain records
        scan_date(results, date)

        connection.domains.upsert_all(
            (results[domain_name] for domain_name in sorted_domains),
            'domain',
            batch_size=batch_size)

        LOGGER.info("Domain removals: %s", id_removals)
        # Delete domain results from 'domains' table
        for record in id_removals:
            resp = connection.domains.delete_one({"domain": record})
            if resp.deleted_count != 1:
                LOGGER.error(
                    "Failed deletion of domain from 'domains' collection: %s",
                    record)
            else:
                LOGGER.warning("Domain deleted from 'domains' collection: %s",
                               record)

        LOGGER.info("Updating or creating organizations.")

        # add scan date in all org records
        scan_date(organizations, date)

        # get remote list of org
        remote_in_org = [
            document['slug'] for document in connection.organizations.all()
        ]

        connection.organizations.upsert_all(
            (organizations[organization_name]
             for organization_name in sorted_organizations),
            'slug',
            batch_size=batch_size)

        # use set logic to find the set of input_domains that need to be removed
        id_removals = set(remote_in_org) - set(sorted_organizations)

        LOGGER.info("Organization removals: %s", id_removals)

        # Delete org results from 'organizations' table
        for record in id_removals:
            resp = connection.organizations.delete_one({"slug": record})
            if resp.deleted_count != 1:
                LOGGER.error(
                    "Failed deletion of organization from 'organizations' collection: %s",
                    record)
            else:
                LOGGER.warning(
                    "Organization deleted from 'organizations' collection: %s",
                    record)

        LOGGER.info("Replacing government-wide totals.")
        connection.reports.replace({}, report)

        LOGGER.info("Saving report to historical collection")
        report2 = report.copy()
        # to be able to query reports by date
        report2['report_timestamp'] = datetime.datetime.today()
        connection.historical.create(report2)

        LOGGER.info("Update cache validity with current time for track-web")
        connection.flags.replace({}, {
            "cache":
            datetime.datetime.strftime(datetime.datetime.now(),
                                       "%Y-%m-%d %H:%M")
        })

    # Print and exit
    print_report(report)
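
Unlike Example #5, this variant upserts domains and organizations into the existing collections and deletes only the records that no longer appear in the input, instead of clearing the collections first. The scan_date helper it calls is not shown; judging from the surrounding comments, it stamps every record with the report date. A hypothetical sketch (the field name is an assumption):

def scan_date(records: dict, date: str) -> None:
    # Assumption: each record gets a "scan_date" field set to the report date.
    for record in records.values():
        record["scan_date"] = date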
Example #8
def run(date: typing.Optional[str], connection_string: str) -> None:
    if date is None:
        date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

    # Read in domains and organizations from domains.csv.
    # Returns dicts of values ready for saving as Domain and Agency objects.
    #
    # Also returns gathered subdomains, which need more filtering to be useful.
    domains, domain_map = load_domain_data()
    acceptable_ciphers = load_compliance_data()

    # Read in domain-scan CSV data.
    scan_data = load_scan_data(domains)

    # Capture manual exclusions and pull out some high-level data from pshtt.
    for domain_name in scan_data:
        # Pull out a few pshtt.csv fields as general domain-level metadata.
        pshtt = scan_data[domain_name].get("pshtt", None)
        if pshtt is None:
            # generally means scan was on different domains.csv, but
            # invalid domains can hit this.
            LOGGER.warning("[%s] No pshtt data for domain!", domain_name)

            # Remove the domain from further consideration.
            # Destructive, so have this done last.
            del domain_map[domain_name]
        elif domain_name in domain_map:
            # LOGGER.info("[%s] Updating with pshtt metadata." % domain_name)
            domain_map[domain_name]["live"] = boolean_for(pshtt["Live"])
            domain_map[domain_name]["redirect"] = boolean_for(pshtt["Redirect"])
            domain_map[domain_name]["canonical"] = pshtt["Canonical URL"]
        elif boolean_for(pshtt['Live']):
            domain_map[domain_name] = {
                "domain": domain_name,
                "is_owner": False,
                "sources": ["canada-gov"],
                "live": True,
                "redirect": boolean_for(pshtt["Redirect"]),
                "canonical": pshtt["Canonical URL"],
                "exclude": {},
            }

    map_subdomains(scan_data, domain_map)
    organizations = extract_orgs(domain_map)

    # Save what we've got to the database so far.
    sorted_domains = sorted(domain_map)
    sorted_organizations = sorted(organizations)

    # Calculate high-level per-domain conclusions for each report.
    # Overwrites `domains` and `subdomains` in-place.
    process_domains(
        domain_map, scan_data, acceptable_ciphers
    )

    # Reset the database.
    LOGGER.info("Clearing the database.")
    with models.Connection(connection_string) as connection:
        connection.domains.clear()
        connection.reports.clear()
        connection.organizations.clear()

        # Calculate organization-level summaries. Updates `organizations` in-place.
        update_organization_totals(organizations, domain_map)

        # Calculate government-wide summaries.
        report = full_report(domain_map)
        report["report_date"] = date

        LOGGER.info("Creating all domains.")
        connection.domains.create_all(domain_map[domain_name] for domain_name in sorted_domains)
        LOGGER.info("Creating all organizations.")
        connection.organizations.create_all(
            organizations[organization_name] for organization_name in sorted_organizations
        )

        # Create top-level summaries.
        LOGGER.info("Creating government-wide totals.")
        connection.reports.create(report)

    # Print and exit
    print_report(report)
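
A minimal invocation sketch for this version of run (the connection string points at a hypothetical local MongoDB instance); passing date=None makes it fall back to today's date:

run(date=None, connection_string="mongodb://localhost:27017/track")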