def run(date):
    if date is None:
        date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

    # Reset the database.
    print("Clearing the database.")
    models.clear_database()
    Report.create(date)

    # Read in domains and agencies from domains.csv.
    # Returns dicts of values ready for saving as Domain and Agency objects.
    domains, agencies = load_domain_data()

    # Read in domain-scan CSV data.
    scan_data = load_scan_data(domains)

    # Pull out a few inspect.csv fields as general domain metadata.
    for domain_name in scan_data.keys():
        inspect = scan_data[domain_name].get('inspect', None)
        if inspect is None:
            # generally means scan was on different domains.csv, but
            # invalid domains can hit this (e.g. fed.us).
            print("[%s][WARNING] No inspect data for domain!" % domain_name)

            # Remove the domain from further consideration.
            del domains[domain_name]
        else:
            # print("[%s] Updating with inspection metadata." % domain_name)
            domains[domain_name]['live'] = boolean_for(inspect['Live'])
            domains[domain_name]['redirect'] = boolean_for(inspect['Redirect'])
            domains[domain_name]['canonical'] = inspect['Canonical']

    # Save what we've got to the database so far.
    for domain_name in domains.keys():
        Domain.create(domains[domain_name])
        print("[%s] Created." % domain_name)
    for agency_name in agencies.keys():
        Agency.create(agencies[agency_name])
        # print("[%s] Created." % agency_name)

    # Calculate high-level per-domain conclusions for each report.
    domain_reports = process_domains(domains, agencies, scan_data)

    # Save them in the database.
    for report_type in domain_reports.keys():
        for domain_name in domain_reports[report_type].keys():
            print("[%s][%s] Adding report." % (report_type, domain_name))
            Domain.add_report(domain_name, report_type,
                              domain_reports[report_type][domain_name])

    # Calculate agency-level summaries.
    update_agency_totals()

    # Create top-level summaries.
    reports = latest_reports()
    for report in reports:
        Report.update(report)

    print_report()
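
# Hypothetical sketch, not part of the original excerpt: every version of
# run() above and below relies on a boolean_for() helper to coerce string
# fields from the scan CSVs. Its real implementation is not shown here; a
# minimal version, assuming the CSVs encode booleans as the literal strings
# "True" and "False", could look like this:

def boolean_for(string):
    # Assumption: any value other than the string "True" counts as False.
    # Adjust if the CSVs use other encodings (e.g. "1"/"0" or blanks).
    return string == "True"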
def run(date):
    if date is None:
        date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

    # Reset the database.
    print("Clearing the database.")
    models.clear_database()
    Report.create(date)

    # Read in domains and agencies from domains.csv.
    # Returns dicts of values ready for saving as Domain and Agency objects.
    domains, agencies = load_domain_data()

    # Read in domain-scan CSV data.
    scan_data = load_scan_data(domains)

    # Load in some manual exclusion data.
    analytics_ineligible = yaml.safe_load(
        open(os.path.join(this_dir, "ineligible/analytics.yml")))
    analytics_ineligible_map = {}
    for domain in analytics_ineligible:
        analytics_ineligible_map[domain] = True

    # Pull out a few pshtt.csv fields as general domain metadata.
    for domain_name in scan_data.keys():
        analytics = scan_data[domain_name].get('analytics', None)
        if analytics:
            ineligible = analytics_ineligible_map.get(domain_name, False)
            domains[domain_name]['exclude']['analytics'] = ineligible

        pshtt = scan_data[domain_name].get('pshtt', None)
        if pshtt is None:
            # generally means scan was on different domains.csv, but
            # invalid domains can hit this.
            print("[%s][WARNING] No pshtt data for domain!" % domain_name)

            # Remove the domain from further consideration.
            # Destructive, so have this done last.
            del domains[domain_name]
        else:
            # print("[%s] Updating with pshtt metadata." % domain_name)
            domains[domain_name]['live'] = boolean_for(pshtt['Live'])
            domains[domain_name]['redirect'] = boolean_for(pshtt['Redirect'])
            domains[domain_name]['canonical'] = pshtt['Canonical URL']

    # Save what we've got to the database so far.
    for domain_name in domains.keys():
        Domain.create(domains[domain_name])
        print("[%s] Created." % domain_name)
    for agency_name in agencies.keys():
        Agency.create(agencies[agency_name])
        # print("[%s] Created." % agency_name)

    # Calculate high-level per-domain conclusions for each report.
    domain_reports = process_domains(domains, agencies, scan_data)

    # Save them in the database.
    for report_type in domain_reports.keys():
        for domain_name in domain_reports[report_type].keys():
            print("[%s][%s] Adding report." % (report_type, domain_name))
            Domain.add_report(domain_name, report_type,
                              domain_reports[report_type][domain_name])

    # Calculate agency-level summaries.
    update_agency_totals()

    # Create top-level summaries.
    reports = latest_reports()
    for report in reports:
        Report.update(report)

    print_report()
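
# Hypothetical refactoring sketch, not part of the original module: the
# manual exclusion file is assumed to be a plain YAML list of domain names
# (one "- example.gov" entry per line). Factoring the load out of run() also
# closes the file handle, which the inline yaml.safe_load(open(...)) call
# above leaks, and a set gives the same O(1) membership test as the dict of
# True values.
import os
import yaml

def load_analytics_ineligible(base_dir):
    # Returns a set of domain names manually marked ineligible for the
    # analytics report.
    with open(os.path.join(base_dir, "ineligible/analytics.yml")) as f:
        return set(yaml.safe_load(f))

# Usage inside run() would then be:
#   analytics_ineligible = load_analytics_ineligible(this_dir)
#   ...
#   ineligible = domain_name in analytics_ineligible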
def run(date, options):
    if date is None:
        date = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d")

    # Read in domains and agencies from domains.csv.
    # Returns dicts of values ready for saving as Domain and Agency objects.
    #
    # Also returns gathered subdomains, which need more filtering to be useful.
    domains, agencies, gathered_subdomains = load_domain_data()

    # Read in domain-scan CSV data.
    parent_scan_data = load_parent_scan_data(domains)
    subdomains, subdomain_scan_data = load_subdomain_scan_data(
        domains, parent_scan_data, gathered_subdomains)

    # Load in some manual exclusion data.
    analytics_ineligible = yaml.safe_load(
        open(os.path.join(this_dir, "ineligible/analytics.yml")))
    analytics_ineligible_map = {}
    for domain in analytics_ineligible:
        analytics_ineligible_map[domain] = True

    # Capture manual exclusions and pull out some high-level data from pshtt.
    for domain_name in parent_scan_data.keys():
        # Mark manual ineligibility for analytics if present.
        analytics = parent_scan_data[domain_name].get('analytics', None)
        if analytics:
            ineligible = analytics_ineligible_map.get(domain_name, False)
            domains[domain_name]['exclude']['analytics'] = ineligible

        # Pull out a few pshtt.csv fields as general domain-level metadata.
        pshtt = parent_scan_data[domain_name].get('pshtt', None)
        if pshtt is None:
            # generally means scan was on different domains.csv, but
            # invalid domains can hit this.
            print("[%s][WARNING] No pshtt data for domain!" % domain_name)

            # Remove the domain from further consideration.
            # Destructive, so have this done last.
            del domains[domain_name]
        else:
            # print("[%s] Updating with pshtt metadata." % domain_name)
            domains[domain_name]['live'] = boolean_for(pshtt['Live'])
            domains[domain_name]['redirect'] = boolean_for(pshtt['Redirect'])
            domains[domain_name]['canonical'] = pshtt['Canonical URL']

    # Prepare subdomains the same way.
    for subdomain_name in subdomain_scan_data.keys():
        pshtt = subdomain_scan_data[subdomain_name].get('pshtt')
        subdomains[subdomain_name]['live'] = boolean_for(pshtt['Live'])
        subdomains[subdomain_name]['redirect'] = boolean_for(pshtt['Redirect'])
        subdomains[subdomain_name]['canonical'] = pshtt['Canonical URL']

    # Sort domain, subdomain, and agency names now, for stable ordering when
    # saving everything to the database below.
    sorted_domains = list(domains.keys())
    sorted_domains.sort()
    sorted_subdomains = list(subdomains.keys())
    sorted_subdomains.sort()
    sorted_agencies = list(agencies.keys())
    sorted_agencies.sort()

    # Calculate high-level per-domain conclusions for each report.
    # Overwrites `domains` and `subdomains` in-place.
    process_domains(domains, agencies, subdomains, parent_scan_data,
                    subdomain_scan_data)

    # Reset the database.
    print("Clearing the database.")
    models.clear_database()

    # Calculate agency-level summaries. Updates `agencies` in-place.
    update_agency_totals(agencies, domains, subdomains)

    # Calculate government-wide summaries.
    report = full_report(domains, subdomains)
    report['report_date'] = date

    print("Creating all domains.")
    Domain.create_all(domains[domain_name] for domain_name in sorted_domains)
    print("Creating all subdomains.")
    Domain.create_all(subdomains[subdomain_name]
                      for subdomain_name in sorted_subdomains)
    print("Creating all agencies.")
    Agency.create_all(agencies[agency_name] for agency_name in sorted_agencies)

    # Create top-level summaries.
    print("Creating government-wide totals.")
    Report.create(report)

    # Print and exit.
    print_report(report)
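
# Hypothetical entry point, not shown in the original excerpt: how the final
# version of run() might be invoked. The `options` parameter is unused in the
# body above, so an empty dict is assumed to be acceptable; the real caller
# may pass parsed CLI flags instead.
if __name__ == "__main__":
    import sys

    # Optional first argument: report date as YYYY-MM-DD; defaults to today.
    date_arg = sys.argv[1] if len(sys.argv) > 1 else None
    run(date_arg, options={})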