def update(
    ctx: click.core.Context,
    owners: typing.IO[str],
    domains: typing.IO[str],
    ciphers: typing.IO[str],
) -> None:
    # Load the owner, domain, and cipher CSV streams and update the database.
    with models.Connection(ctx.obj.get("connection_string")) as connection:
        update_data(owners, domains, ciphers, connection)
    LOGGER.info("'tracker update' completed.")
def insert(
    ctx: click.core.Context,
    owners: typing.IO[str],
    domains: typing.IO[str],
    ciphers: typing.IO[str],
    upsert: bool,
) -> None:
    # Insert the owner, domain, and cipher CSV streams, optionally upserting
    # records that already exist.
    with models.Connection(ctx.obj.get("connection_string")) as connection:
        insert_data(owners, domains, ciphers, upsert, connection)
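# These subcommands read the connection string from ctx.obj, which a parent
# Click group is expected to populate (the Click decorators themselves are not
# shown in these excerpts). A minimal sketch of such a group, assuming the
# option name, environment variable, and default URI below -- all of which are
# illustrative, not the project's actual configuration:

import click


@click.group()
@click.option(
    "--connection-string",  # assumed option name, for illustration only
    envvar="TRACKER_MONGO_URI",  # hypothetical environment variable
    default="mongodb://localhost:27017/track",
)
@click.pass_context
def cli(ctx: click.core.Context, connection_string: str) -> None:
    # Ensure ctx.obj is a dict so subcommands can call ctx.obj.get(...).
    ctx.ensure_object(dict)
    ctx.obj["connection_string"] = connection_string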
def connection(request: _pytest.fixtures.SubRequest) -> typing.Iterator[models.Connection]:
    if not local_mongo_is_running():
        pytest.skip('Local MongoDB instance is not running.')
    connection_string = request.param
    with models.Connection(connection_string) as connection:
        yield connection
    # Clean up after the test: drop the database named in the connection
    # string, falling back to the default 'track' database when the URI
    # does not name one.
    with pymongo.MongoClient(connection_string) as client:
        try:
            client.drop_database(client.get_database())
        except pymongo.errors.ConfigurationError:
            client.drop_database('track')
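# The fixture guards on local_mongo_is_running(), which is not part of this
# excerpt. A minimal sketch of what such a helper could look like, assuming it
# simply attempts a short-timeout ping against the local instance (the URI
# default and the 1-second timeout are illustrative assumptions):

import pymongo
import pymongo.errors


def local_mongo_is_running(uri: str = "mongodb://localhost:27017") -> bool:
    # A short server-selection timeout keeps the check fast when nothing
    # is listening on the local port.
    try:
        with pymongo.MongoClient(uri, serverSelectionTimeoutMS=1000) as client:
            client.admin.command("ping")
        return True
    except pymongo.errors.ConnectionFailure:
        return False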
def insert(
    ctx: click.core.Context,
    owners: typing.IO[str],
    domains: typing.IO[str],
    ciphers: typing.IO[str],
) -> None:
    # Stream each CSV through a DictReader and bulk-insert the rows.
    owners_reader = csv.DictReader(owners)
    domains_reader = csv.DictReader(domains)
    ciphers_reader = csv.DictReader(ciphers)
    with models.Connection(ctx.obj.get('connection_string')) as connection:
        connection.owners.create_all(document for document in owners_reader)
        connection.input_domains.create_all(document for document in domains_reader)
        connection.ciphers.create_all(document for document in ciphers_reader)
def run(date: typing.Optional[str], connection_string: str, batch_size: typing.Optional[int] = None):
    if date is None:
        date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

    # Read in domains and organizations from domains.csv.
    # Returns dicts of values ready for saving as Domain and Agency objects.
    #
    # Also returns gathered subdomains, which need more filtering to be useful.
    domains, owners = load_domain_data()
    results = {}

    acceptable_ciphers = load_compliance_data()

    # Read in domain-scan CSV data.
    scan_data = load_scan_data(domains)

    # Capture manual exclusions and pull out some high-level data from pshtt.
    for domain_name in scan_data:
        # Pull out a few pshtt.csv fields as general domain-level metadata.
        domain_data = scan_data[domain_name]
        pshtt = domain_data.get("pshtt", None)
        if pshtt is None:
            # generally means scan was on different domains.csv, but
            # invalid domains can hit this.
            LOGGER.warning("[%s] No pshtt data for domain!", domain_name)
        elif boolean_for(pshtt['Live']):
            if boolean_for(pshtt['Redirect']):
                redirection = urlparse(pshtt["Redirect To"]).netloc
                if redirection not in domains:
                    LOGGER.warning(
                        "%s redirected to %s which is not in the domains list",
                        domain_name,
                        redirection,
                    )
            results[domain_name] = {
                "domain": domain_name,
                "is_owner": domain_name in owners,
                "is_parent": domain_name in owners,
                "sources": ["canada-gov"],
                "live": True,
                "redirect": boolean_for(pshtt["Redirect"]),
                "canonical": pshtt["Canonical URL"],
                "exclude": {},
            }

    # Find the parent domain for all domains in the owner list, mutating results in place
    map_subdomains(results, owners)

    # Extract organizations actually used in the set of scanned domains, and their counts
    organizations = extract_orgs(results)

    sorted_domains = list(results.keys())
    sorted_domains.sort()
    sorted_organizations = list(organizations.keys())
    sorted_organizations.sort()

    # Calculate high-level per-domain conclusions for each report.
    # Overwrites `results` in place
    process_https(results, scan_data, acceptable_ciphers)

    # Totals scan data for parent domains
    total_reports(results, owners)

    # Calculate organization-level summaries. Updates `organizations` in-place.
    update_organization_totals(organizations, results)

    # Calculate government-wide summaries.
    report = full_report(results)
    report["report_date"] = date

    # Reset the database.
    with models.Connection(connection_string) as connection:
        LOGGER.info("Clearing the domains.")
        connection.domains.clear(batch_size=batch_size)
        LOGGER.info("Creating all domains.")
        connection.domains.create_all(
            (results[domain_name] for domain_name in sorted_domains),
            batch_size=batch_size,
        )

        LOGGER.info("Clearing organizations.")
        connection.organizations.clear(batch_size=batch_size)
        LOGGER.info("Creating all organizations.")
        connection.organizations.create_all(
            (organizations[organization_name] for organization_name in sorted_organizations),
            batch_size=batch_size,
        )

        LOGGER.info("Replacing government-wide totals.")
        connection.reports.replace({}, report)

        LOGGER.info("Signal track-web to drop cache")
        connection.flags.replace({}, {"cache": False})

    # Print and exit
    print_report(report)
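# boolean_for() converts the string values that pshtt writes into its CSV
# output into Python booleans; its definition is not part of this excerpt.
# A minimal sketch, assuming the CSV cells hold spellings such as
# "True"/"False" (the exact set of accepted spellings is an assumption):


def boolean_for(value: str) -> bool:
    # Treat the usual truthy spellings as True and everything else,
    # including empty cells, as False.
    return str(value).strip().lower() in {"true", "yes", "1"}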
def preprocess(ctx: click.core.Context, output: typing.Optional[str]) -> None:
    if not output:
        output = os.path.join(os.getcwd(), "csv")
    with models.Connection(ctx.obj.get("connection_string")) as connection:
        pull_data(output, connection)
def run(date: typing.Optional[str], connection_string: str, batch_size: typing.Optional[int] = None):
    if date is None:
        date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

    # Read in domains and organizations from domains.csv.
    # Returns dicts of values ready for saving as Domain and Agency objects.
    #
    # Also returns gathered subdomains, which need more filtering to be useful.
    domains, owners = load_domain_data()
    results = {}

    acceptable_ciphers = load_compliance_data()

    # Read in domain-scan CSV data.
    scan_data = load_scan_data(domains)

    # Capture manual exclusions and pull out some high-level data from pshtt.
    for domain_name in scan_data:
        # Pull out a few pshtt.csv fields as general domain-level metadata.
        domain_data = scan_data[domain_name]
        pshtt = domain_data.get("pshtt", None)
        if pshtt is None:
            # generally means scan was on different domains.csv, but
            # invalid domains can hit this.
            LOGGER.warning("[%s] No pshtt data for domain!", domain_name)
        elif boolean_for(pshtt['Live']):
            if boolean_for(pshtt['Redirect']):
                redirection = urlparse(pshtt["Redirect To"]).netloc
                if redirection not in domains:
                    LOGGER.warning(
                        "%s redirected to %s which is not in the domains list",
                        domain_name,
                        redirection,
                    )
            results[domain_name] = {
                "domain": domain_name,
                "is_owner": domain_name in owners,
                "is_parent": domain_name in owners,
                "sources": ["canada-gov"],
                "live": True,
                "redirect": boolean_for(pshtt["Redirect"]),
                "redirect_url": urlparse(pshtt["Redirect To"]).geturl(),
                "canonical": pshtt["Canonical URL"],
                "exclude": {},
            }

    # Find the parent domain for all domains in the owner list, mutating results in place
    map_subdomains(results, owners)

    # Extract organizations actually used in the set of scanned domains, and their counts
    organizations = extract_orgs(results)

    sorted_domains = list(results.keys())
    sorted_domains.sort()
    sorted_organizations = list(organizations.keys())
    sorted_organizations.sort()

    # Calculate high-level per-domain conclusions for each report.
    # Overwrites `results` in place
    process_https(results, scan_data, acceptable_ciphers)

    # Totals scan data for parent domains
    total_reports(results, owners)

    # Calculate organization-level summaries. Updates `organizations` in-place.
    update_organization_totals(organizations, results)

    # Calculate government-wide summaries.
    report = full_report(results)
    report["report_date"] = date

    # Update the database in place (upserts plus targeted removals), rather
    # than clearing and recreating it.
    with models.Connection(connection_string) as connection:
        LOGGER.info("Updating or creating all domains.")
        # get remote list of domains
        remote_in_domains = [
            document['domain'] for document in connection.domains.all()
        ]
        # use set logic to find the set of domains that need to be removed
        id_removals = set(remote_in_domains) - set(sorted_domains)
        # add scan date in all domain records
        scan_date(results, date)
        connection.domains.upsert_all(
            (results[domain_name] for domain_name in sorted_domains),
            'domain',
            batch_size=batch_size,
        )
        LOGGER.info("Domain removals: %s", id_removals)
        # Delete domain results from the 'domains' collection
        for record in id_removals:
            resp = connection.domains.delete_one({"domain": record})
            if resp.deleted_count != 1:
                LOGGER.error(
                    "Failed deletion of domain from 'domains' collection: %s",
                    record,
                )
            else:
                LOGGER.warning("Domain deleted from 'domains' collection: %s", record)

        LOGGER.info("Updating or creating organizations.")
        # add scan date in all org records
        scan_date(organizations, date)
        # get remote list of orgs
        remote_in_org = [
            document['slug'] for document in connection.organizations.all()
        ]
        connection.organizations.upsert_all(
            (organizations[organization_name] for organization_name in sorted_organizations),
            'slug',
            batch_size=batch_size,
        )
        # use set logic to find the set of organizations that need to be removed
        id_removals = set(remote_in_org) - set(sorted_organizations)
        LOGGER.info("Organization removals: %s", id_removals)
        # Delete org results from the 'organizations' collection
        for record in id_removals:
            resp = connection.organizations.delete_one({"slug": record})
            if resp.deleted_count != 1:
                LOGGER.error(
                    "Failed deletion of organization from 'organizations' collection: %s",
                    record,
                )
            else:
                LOGGER.warning(
                    "Organization deleted from 'organizations' collection: %s",
                    record,
                )

        LOGGER.info("Replacing government-wide totals.")
        connection.reports.replace({}, report)

        LOGGER.info("Saving report to historical collection")
        report2 = report.copy()
        # to be able to query reports by date
        report2['report_timestamp'] = datetime.datetime.today()
        connection.historical.create(report2)

        LOGGER.info("Update cache validity with current time for track-web")
        connection.flags.replace({}, {
            "cache": datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %H:%M")
        })

    # Print and exit
    print_report(report)
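# scan_date() is called on both the domain and organization maps before
# upserting, but its definition is not part of this excerpt. A minimal sketch
# of the behavior the comments imply -- stamping every record with the report
# date, mutating the map in place. The 'scan_date' key name is an assumption;
# the project may use a different field:

import typing


def scan_date(records: typing.Dict[str, dict], date: str) -> None:
    # Add the date of this scan to every record so upserted documents
    # carry the report date ('scan_date' is an illustrative key name).
    for record in records.values():
        record["scan_date"] = date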
def run(date: typing.Optional[str], connection_string: str):
    if date is None:
        date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

    # Read in domains and organizations from domains.csv.
    # Returns dicts of values ready for saving as Domain and Agency objects.
    #
    # Also returns gathered subdomains, which need more filtering to be useful.
    domains, domain_map = load_domain_data()

    acceptable_ciphers = load_compliance_data()

    # Read in domain-scan CSV data.
    scan_data = load_scan_data(domains)

    # Capture manual exclusions and pull out some high-level data from pshtt.
    for domain_name in scan_data:
        # Pull out a few pshtt.csv fields as general domain-level metadata.
        pshtt = scan_data[domain_name].get("pshtt", None)
        if pshtt is None:
            # generally means scan was on different domains.csv, but
            # invalid domains can hit this.
            LOGGER.warning("[%s] No pshtt data for domain!", domain_name)
            # Remove the domain from further consideration.
            # Destructive, so have this done last.
            del domain_map[domain_name]
        elif domain_name in domain_map:
            # LOGGER.info("[%s] Updating with pshtt metadata." % domain_name)
            domain_map[domain_name]["live"] = boolean_for(pshtt["Live"])
            domain_map[domain_name]["redirect"] = boolean_for(pshtt["Redirect"])
            domain_map[domain_name]["canonical"] = pshtt["Canonical URL"]
        elif boolean_for(pshtt['Live']):
            domain_map[domain_name] = {
                "domain": domain_name,
                "is_owner": False,
                "sources": ["canada-gov"],
                "live": True,
                "redirect": boolean_for(pshtt["Redirect"]),
                "canonical": pshtt["Canonical URL"],
                "exclude": {},
            }

    map_subdomains(scan_data, domain_map)
    organizations = extract_orgs(domain_map)

    # Save what we've got to the database so far.
    sorted_domains = list(domain_map.keys())
    sorted_domains.sort()
    sorted_organizations = list(organizations.keys())
    sorted_organizations.sort()

    # Calculate high-level per-domain conclusions for each report.
    # Overwrites `domains` and `subdomains` in-place.
    process_domains(domain_map, scan_data, acceptable_ciphers)

    # Reset the database.
    LOGGER.info("Clearing the database.")
    with models.Connection(connection_string) as connection:
        connection.domains.clear()
        connection.reports.clear()
        connection.organizations.clear()

        # Calculate organization-level summaries. Updates `organizations` in-place.
        update_organization_totals(organizations, domain_map)

        # Calculate government-wide summaries.
        report = full_report(domain_map)
        report["report_date"] = date

        LOGGER.info("Creating all domains.")
        connection.domains.create_all(domain_map[domain_name] for domain_name in sorted_domains)

        LOGGER.info("Creating all organizations.")
        connection.organizations.create_all(
            organizations[organization_name] for organization_name in sorted_organizations
        )

        # Create top-level summaries.
        LOGGER.info("Creating government-wide totals.")
        connection.reports.create(report)

    # Print and exit
    print_report(report)