Code example #1
File: processing.py Project: byeskille/pulse
def run(date):
  if date is None:
    date = datetime.datetime.now().strftime("%Y-%m-%d")

  # Reset the database.
  print("Clearing the database.")
  models.clear_database()
  Report.create(date)

  # Read in domains and agencies from domains.csv.
  # Returns dicts of values ready for saving as Domain and Agency objects.
  domains, agencies = load_domain_data()

  # Read in domain-scan CSV data.
  scan_data = load_scan_data(domains)

  # Pull out a few inspect.csv fields as general domain metadata.
  for domain_name in scan_data.keys():
    inspect = scan_data[domain_name].get('inspect', None)
    if inspect is None:
      # generally means scan was on different domains.csv, but
      # invalid domains can hit this (e.g. fed.us).
      print("[%s][WARNING] No inspect data for domain!" % domain_name)

      # Remove the domain from further consideration.
      del domains[domain_name]
    else:
      # print("[%s] Updating with inspection metadata." % domain_name)
      domains[domain_name]['live'] = boolean_for(inspect['Live'])
      domains[domain_name]['redirect'] = boolean_for(inspect['Redirect'])
      domains[domain_name]['canonical'] = inspect['Canonical']

  # Save what we've got to the database so far.

  for domain_name in domains.keys():
    Domain.create(domains[domain_name])
    print("[%s] Created." % domain_name)
  for agency_name in agencies.keys():
    Agency.create(agencies[agency_name])
    # print("[%s] Created." % agency_name)


  # Calculate high-level per-domain conclusions for each report.
  domain_reports = process_domains(domains, agencies, scan_data)
  # Save them in the database.
  for report_type in domain_reports.keys():
    for domain_name in domain_reports[report_type].keys():
      print("[%s][%s] Adding report." % (report_type, domain_name))
      Domain.add_report(domain_name, report_type, domain_reports[report_type][domain_name])

  # Calculate agency-level summaries.
  update_agency_totals()

  # Create top-level summaries.
  reports = latest_reports()
  for report in reports:
    Report.update(report)

  print_report()
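Every variant of run() shown here relies on a boolean_for helper defined elsewhere in processing.py to turn the scanner's CSV string fields into real booleans. Its definition is not part of these excerpts; the following is a minimal sketch, assuming the CSVs encode the 'Live' and 'Redirect' columns as the strings "True" and "False":

def boolean_for(string):
    # Hypothetical sketch of the helper used above; the real project's
    # version may differ. Treats the literal CSV value "False" as False
    # and anything else as True.
    return string != "False"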
Code example #2
File: processing.py Project: matthewcrist/pulse
def run(date):
    if date is None:
        date = datetime.datetime.now().strftime("%Y-%m-%d")

    # Reset the database.
    print("Clearing the database.")
    models.clear_database()
    Report.create(date)

    # Read in domains and agencies from domains.csv.
    # Returns dicts of values ready for saving as Domain and Agency objects.
    domains, agencies = load_domain_data()

    # Read in domain-scan CSV data.
    scan_data = load_scan_data(domains)

    # Load in some manual exclusion data.
    with open(os.path.join(this_dir, "ineligible/analytics.yml")) as f:
        analytics_ineligible = yaml.safe_load(f)
    analytics_ineligible_map = {domain: True for domain in analytics_ineligible}

    # Pull out a few pshtt.csv fields as general domain metadata.
    for domain_name in scan_data.keys():
        analytics = scan_data[domain_name].get('analytics', None)
        if analytics:
            ineligible = analytics_ineligible_map.get(domain_name, False)
            domains[domain_name]['exclude']['analytics'] = ineligible

        pshtt = scan_data[domain_name].get('pshtt', None)
        if pshtt is None:
            # generally means scan was on different domains.csv, but
            # invalid domains can hit this.
            print("[%s][WARNING] No pshtt data for domain!" % domain_name)

            # Remove the domain from further consideration.
            # Destructive, so have this done last.
            del domains[domain_name]
        else:
            # print("[%s] Updating with pshtt metadata." % domain_name)
            domains[domain_name]['live'] = boolean_for(pshtt['Live'])
            domains[domain_name]['redirect'] = boolean_for(pshtt['Redirect'])
            domains[domain_name]['canonical'] = pshtt['Canonical URL']

    # Save what we've got to the database so far.

    for domain_name in domains.keys():
        Domain.create(domains[domain_name])
        print("[%s] Created." % domain_name)
    for agency_name in agencies.keys():
        Agency.create(agencies[agency_name])
        # print("[%s] Created." % agency_name)

    # Calculate high-level per-domain conclusions for each report.
    domain_reports = process_domains(domains, agencies, scan_data)
    # Save them in the database.
    for report_type in domain_reports.keys():
        for domain_name in domain_reports[report_type].keys():
            print("[%s][%s] Adding report." % (report_type, domain_name))
            Domain.add_report(domain_name, report_type,
                              domain_reports[report_type][domain_name])

    # Calculate agency-level summaries.
    update_agency_totals()

    # Create top-level summaries.
    reports = latest_reports()
    for report in reports:
        Report.update(report)

    print_report()
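Examples #2 through #4 build analytics_ineligible_map from ineligible/analytics.yml, which is presumably a flat YAML list of domain names that should be excluded from the analytics report. A minimal sketch of the lookup-map pattern, with a hypothetical inline document standing in for the real file:

import yaml

# Hypothetical stand-in for the contents of ineligible/analytics.yml.
sample_yaml = """
- example1.gov
- example2.gov
"""

analytics_ineligible_map = {domain: True for domain in yaml.safe_load(sample_yaml)}
print(analytics_ineligible_map.get("example1.gov", False))  # True
print(analytics_ineligible_map.get("other.gov", False))     # False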
Code example #3
File: processing.py Project: uncompiled/pulse
def run(date):
  if date is None:
    date = datetime.datetime.now().strftime("%Y-%m-%d")

  # Reset the database.
  print("Clearing the database.")
  models.clear_database()
  Report.create(date)

  # Read in domains and agencies from domains.csv.
  # Returns dicts of values ready for saving as Domain and Agency objects.
  domains, agencies = load_domain_data()

  # Read in domain-scan CSV data.
  scan_data = load_scan_data(domains)

  # Load in some manual exclusion data.
  with open(os.path.join(this_dir, "ineligible/analytics.yml")) as f:
    analytics_ineligible = yaml.safe_load(f)
  analytics_ineligible_map = {domain: True for domain in analytics_ineligible}

  # Pull out a few pshtt.csv fields as general domain metadata.
  for domain_name in scan_data.keys():
    analytics = scan_data[domain_name].get('analytics', None)
    if analytics:
      ineligible = analytics_ineligible_map.get(domain_name, False)
      domains[domain_name]['exclude']['analytics'] = ineligible


    pshtt = scan_data[domain_name].get('pshtt', None)
    if pshtt is None:
      # generally means scan was on different domains.csv, but
      # invalid domains can hit this.
      print("[%s][WARNING] No pshtt data for domain!" % domain_name)

      # Remove the domain from further consideration.
      # Destructive, so have this done last.
      del domains[domain_name]
    else:
      # print("[%s] Updating with pshtt metadata." % domain_name)
      domains[domain_name]['live'] = boolean_for(pshtt['Live'])
      domains[domain_name]['redirect'] = boolean_for(pshtt['Redirect'])
      domains[domain_name]['canonical'] = pshtt['Canonical URL']


  # Save what we've got to the database so far.

  for domain_name in domains.keys():
    Domain.create(domains[domain_name])
    print("[%s] Created." % domain_name)
  for agency_name in agencies.keys():
    Agency.create(agencies[agency_name])
    # print("[%s] Created." % agency_name)


  # Calculate high-level per-domain conclusions for each report.
  domain_reports = process_domains(domains, agencies, scan_data)
  # Save them in the database.
  for report_type in domain_reports.keys():
    for domain_name in domain_reports[report_type].keys():
      print("[%s][%s] Adding report." % (report_type, domain_name))
      Domain.add_report(domain_name, report_type, domain_reports[report_type][domain_name])

  # Calculate agency-level summaries.
  update_agency_totals()

  # Create top-level summaries.
  reports = latest_reports()
  for report in reports:
    Report.update(report)

  print_report()
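The assignment domains[domain_name]['exclude']['analytics'] = ineligible only works if load_domain_data seeds each domain record with an 'exclude' sub-dict. A sketch of the record shape these loops assume (any field not read or written above is hypothetical):

# Assumed shape of one entry in `domains` after load_domain_data, inferred
# from the keys the loops above read and write.
domain_record = {
    'domain': 'example.gov',          # hypothetical identifying field
    'exclude': {'analytics': False},  # must be pre-seeded for the assignment to succeed
    # 'live', 'redirect', and 'canonical' are filled in from the pshtt row
}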
Code example #4
File: processing.py Project: isabella232/pulse-labs
def run(date, options):
    if date is None:
        date = datetime.datetime.now().strftime("%Y-%m-%d")

    # Read in domains and agencies from domains.csv.
    # Returns dicts of values ready for saving as Domain and Agency objects.
    #
    # Also returns gathered subdomains, which need more filtering to be useful.
    domains, agencies, gathered_subdomains = load_domain_data()

    # Read in domain-scan CSV data.
    parent_scan_data = load_parent_scan_data(domains)
    subdomains, subdomain_scan_data = load_subdomain_scan_data(
        domains, parent_scan_data, gathered_subdomains)

    # Load in some manual exclusion data.
    with open(os.path.join(this_dir, "ineligible/analytics.yml")) as f:
        analytics_ineligible = yaml.safe_load(f)
    analytics_ineligible_map = {domain: True for domain in analytics_ineligible}

    # Capture manual exclusions and pull out some high-level data from pshtt.
    for domain_name in parent_scan_data.keys():

        # Mark manual ineligibility for analytics, if present.
        analytics = parent_scan_data[domain_name].get('analytics', None)
        if analytics:
            ineligible = analytics_ineligible_map.get(domain_name, False)
            domains[domain_name]['exclude']['analytics'] = ineligible

        # Pull out a few pshtt.csv fields as general domain-level metadata.
        pshtt = parent_scan_data[domain_name].get('pshtt', None)
        if pshtt is None:
            # generally means scan was on different domains.csv, but
            # invalid domains can hit this.
            print("[%s][WARNING] No pshtt data for domain!" % domain_name)

            # Remove the domain from further consideration.
            # Destructive, so have this done last.
            del domains[domain_name]
        else:
            # print("[%s] Updating with pshtt metadata." % domain_name)
            domains[domain_name]['live'] = boolean_for(pshtt['Live'])
            domains[domain_name]['redirect'] = boolean_for(pshtt['Redirect'])
            domains[domain_name]['canonical'] = pshtt['Canonical URL']

    # Prepare subdomains the same way (this loop assumes every subdomain
    # scan entry includes pshtt results, so there is no None guard).
    for subdomain_name in subdomain_scan_data.keys():
        pshtt = subdomain_scan_data[subdomain_name].get('pshtt')
        subdomains[subdomain_name]['live'] = boolean_for(pshtt['Live'])
        subdomains[subdomain_name]['redirect'] = boolean_for(pshtt['Redirect'])
        subdomains[subdomain_name]['canonical'] = pshtt['Canonical URL']

    # Sort keys so records are saved in a stable, deterministic order.
    sorted_domains = sorted(domains)
    sorted_subdomains = sorted(subdomains)
    sorted_agencies = sorted(agencies)

    # Calculate high-level per-domain conclusions for each report.
    # Overwrites `domains` and `subdomains` in-place.
    process_domains(domains, agencies, subdomains, parent_scan_data,
                    subdomain_scan_data)

    # Reset the database.
    print("Clearing the database.")
    models.clear_database()

    # Calculate agency-level summaries. Updates `agencies` in-place.
    update_agency_totals(agencies, domains, subdomains)

    # Calculate government-wide summaries.
    report = full_report(domains, subdomains)
    report['report_date'] = date

    print("Creating all domains.")
    Domain.create_all(domains[domain_name] for domain_name in sorted_domains)
    print("Creating all subdomains.")
    Domain.create_all(subdomains[subdomain_name]
                      for subdomain_name in sorted_subdomains)
    print("Creating all agencies.")
    Agency.create_all(agencies[agency_name] for agency_name in sorted_agencies)

    # Create top-level summaries.
    print("Creating government-wide totals.")
    Report.create(report)

    # Print and exit
    print_report(report)
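Compared with the earlier variants, this one sorts keys for a deterministic save order, postpones models.clear_database() until processing has finished (shrinking the window in which the database sits empty), and replaces the per-record create calls with batched create_all calls fed by generator expressions. The models module is not shown; the following is a minimal sketch of what such a bulk helper could look like, assuming a pymongo-style collection underneath:

# Hypothetical sketch of a bulk-insert helper; the real Domain and Agency
# models in pulse may be implemented quite differently.
class Domain:
    collection = None  # e.g. a pymongo Collection, wired up at startup

    @classmethod
    def create_all(cls, records):
        # Callers pass generator expressions, so materialize them first;
        # insert_many rejects empty lists.
        docs = list(records)
        if docs:
            cls.collection.insert_many(docs)

Batching this way trades per-record progress logging for far fewer database round trips, which is why this variant prints one message per collection rather than one per row.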