def definitions(base='.'):
  """
  Check that all definition.py files are valid.

  Walks every ``slug -> config`` pair returned by ``registry(base)`` and prints
  one line per problem found, with the slug left-justified in 50 columns. The
  function has no return statement; its result is what it prints.

  Checks, in order: the LICENSE.txt file next to the definition, unrecognized
  keys, non-unique values, empty values, missing required keys, fields that
  must be absent or must have a fixed value and, for every slug other than
  'Census divisions' and 'Census subdivisions', the metadata keys, the
  uniqueness of the division ID and the values returned by ``get_definition``.

  :param base: forwarded unchanged to ``registry``; presumably the directory
    searched for definition files -- confirm against ``registry``.
  """
  def warn(message, slug):
    # Print each distinct message only once, for the first slug that triggers it.
    if message not in seen:
      print('%-50s %s' % (slug, message))
      seen.add(message)

  seen = set()  # messages already printed by warn()
  division_ids = set()  # division IDs encountered so far
  for slug, config in registry(base).items():
    directory = dirname(config['file'])

    # Validate LICENSE.txt.
    # Nothing is checked when the file does not exist.
    license_path = os.path.join(directory, 'LICENSE.txt')
    if os.path.exists(license_path):
      with open(license_path) as f:
        license_text = f.read().rstrip('\n')
      if config.get('licence_url'):
        licence_url = config['licence_url']
        if licence_url in open_data_licenses or licence_url in some_rights_reserved_licenses:
          # A known licence needs either an exact template (`terms`) or a
          # regular-expression template (`terms_re`).
          if not terms.get(licence_url) and not terms_re.get(licence_url):
            warn('No LICENSE.txt template for License URL %s' % licence_url, slug)
          # `and` binds tighter than `or`: the text differs from the exact
          # template, or it does not match the regular-expression template.
          elif terms.get(licence_url) and license_text != terms[licence_url] or terms_re.get(licence_url) and not terms_re[licence_url].search(license_text):
            print('%-50s Expected LICENSE.txt to match license-specific template' % slug)
        elif licence_url in all_rights_reserved_licenses:
          if not all_rights_reserved_terms_re.search(license_text):
            print('%-50s Expected LICENSE.txt to match "all rights reserved" template' % slug)
        else:
          print('%-50s Unrecognized License URL %s' % (slug, licence_url))
      # Without a licence URL, the text must match the "all rights reserved" template.
      elif not all_rights_reserved_terms_re.search(license_text):
        print('%-50s Expected LICENSE.txt to match "all rights reserved" template' % slug)

    # Check for invalid keys, non-unique or empty values.
    invalid_keys = set(config.keys()) - valid_keys
    if invalid_keys:
      print('%-50s Unrecognized key: %s' % (slug, ', '.join(invalid_keys)))
    # Values are compared across keys ('metadata' excluded): two different keys
    # holding the same value trigger the warning.
    values = [value for key, value in config.items() if key != 'metadata']
    if len(values) > len(set(values)):
      print('%-50s Non-unique values' % slug)
    for key, value in config.items():
      if not value:
        print('%-50s Empty value for %s' % (slug, key))

    # Check for missing required keys.
    # `config.get` is used, so a key that is present but falsy also counts as missing.
    for key in ('domain', 'last_updated', 'name_func', 'authority', 'encoding'):
      if not config.get(key):
        print('%-50s Missing %s' % (slug, key))
    # A data_url requires a source_url.
    if not config.get('source_url') and config.get('data_url'):
      print('%-50s Missing source_url' % slug)
    # A source_url requires a licence_url or a data_url.
    if config.get('source_url') and not config.get('licence_url') and not config.get('data_url'):
      print('%-50s Missing licence_url or data_url' % slug)

    # Validate fields.
    for key in ('name', 'singular'):
      if config.get(key):
        print('%-50s Expected %s to be missing' % (slug, key))
    if config.get('encoding') and config['encoding'] != 'iso-8859-1':
      print('%-50s Expected encoding to be iso-8859-1 not %s' % (slug, config['encoding']))

    if slug not in ('Census divisions', 'Census subdivisions'):
      # Check for invalid keys or empty values.
      # NOTE: `config['metadata']` is subscripted directly, so a definition
      # without a 'metadata' key raises KeyError here.
      invalid_keys = set(config['metadata'].keys()) - valid_metadata_keys
      if invalid_keys:
        print('%-50s Unrecognized key: %s' % (slug, ', '.join(invalid_keys)))
      for key, value in config['metadata'].items():
        if not value:
          print('%-50s Empty value for %s' % (slug, key))

      division_id = get_division_id(slug, config)

      # Ensure division_id is unique.
      if division_id in division_ids:
        print('%-50s Duplicate division_id %s' % (slug, division_id))
      else:
        division_ids.add(division_id)

      expected_slug, expected_config = get_definition(division_id)

      # Check for unexpected values.
      # NOTE: `config[key]` is subscripted directly, so an expected key that is
      # absent from the definition raises KeyError.
      assert_match(slug, 'slug', slug, expected_slug)
      for key, value in expected_config.items():
        assert_match(slug, key, config[key], value)
def spreadsheet(base='.', private_base='../represent-canada-private-data'):
  """
  Build the rows of the progress-tracking spreadsheet and write them to
  standard output.

  Steps, in order:

  1. Download lookup tables (municipal subdivisions, URLs, province and
     territory abbreviations) and the scraper URLs of representative sets.
  2. Create one record per province or territory and one per row of the
     Statistics Canada 2011 population file.
  3. Merge information from the definitions found under `base` (permission to
     distribute 'Y') and `private_base` (permission to distribute 'N').
  4. Compare each record with the published Google spreadsheet: some
     manually-tracked spreadsheet values are copied into the record, other
     differences are written to standard error.
  5. Write `headers` and then every record through `UnicodeWriter(sys.stdout)`.

  The function relies on names that are not defined in this block
  (`municipal_subdivisions`, `request_headers`, `receipt_headers`, `headers`,
  `csv_reader`, `registry`, ...); they must be provided by the module.

  NOTE(review): this is Python 2 code (`reader.next()`, `str.decode`).

  :param base: forwarded to `registry` for the distributable definitions.
  :param private_base: forwarded to `registry` for the private definitions.
  """
  sgc_code_to_ocdid_map = sgc_code_to_ocdid()
  # Reverse mapping: OCD identifier -> SGC code.
  ocdid_to_sgc_code_map = {v: k for k, v in sgc_code_to_ocdid_map.items()}
  records = OrderedDict()

  # Key: last colon-separated segment of the first column; value: second column
  # (compared against 'N', 'Y' and '?' further down).
  reader = csv_reader('https://raw.github.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_municipal_subdivisions-has_children.csv')
  reader.next()
  for row in reader:
    municipal_subdivisions[row[0].split(':')[-1]] = row[1]

  # Same keying as above; the second column is used as the record's 'URL'.
  urls = {}
  reader = csv_reader('https://raw.github.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_census_subdivisions-url.csv')
  reader.next()
  for row in reader:
    urls[row[0].split(':')[-1]] = row[1]

  # Maps the third column to the upper-cased last segment of the identifier;
  # looked up below with the parenthesized part of a Statistics Canada name.
  abbreviations = {}
  reader = csv_reader('https://raw.github.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv')
  reader.next()
  for row in reader:
    abbreviations[row[2]] = row[0].split(':')[-1].upper()

  # Get scraper URLs.
  # One extra HTTP request is made per representative set whose boundary set is
  # not the census subdivisions set.
  scraper_urls = {}
  for representative_set in requests.get('http://represent.opennorth.ca/representative-sets/?limit=0').json()['objects']:
    boundary_set_url = representative_set['related']['boundary_set_url']
    if boundary_set_url:
      if boundary_set_url != '/boundary-sets/census-subdivisions/':
        boundary_set = requests.get('http://represent.opennorth.ca%s' % boundary_set_url).json()
        if boundary_set.get('extra') and boundary_set['extra'].get('geographic_code'):
          # Prefer data_about_url; fall back to data_url when it is empty.
          scraper_urls[boundary_set['extra']['geographic_code']] = representative_set['data_about_url'] or representative_set['data_url']
        else:
          sys.stderr.write('%-60s No extra\n' % boundary_set_url)
    else:
      sys.stderr.write('%-60s No boundary_set_url\n' % representative_set['url'])

  # Create records for provinces and territories.
  reader = csv_reader('https://raw.github.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv')
  reader.next()
  for row in reader:
    records[ocdid_to_sgc_code_map[row[0]]] = {
      'OCD': row[0],
      'Geographic code': ocdid_to_sgc_code_map[row[0]],
      'Geographic name': row[1],
      'Geographic type': '',
      'Province or territory': row[0].split(':')[-1].upper(),
      'Population': '',
      'URL': '',
      'Scraper?': scraper_urls.get(ocdid_to_sgc_code_map[row[0]], ''),
      'Shapefile?': '',
      'Contact': '',
      'Request notes': '',
      'Received via': '',
      'Last boundary': '',
      'Next boundary': '',
      'Permission to distribute': '',
      'Highrise URL': '',
      'Type of license': '',
      'License URL': '',
      'Denial notes': '',
    }

  # Create records for census subdivisions.
  reader = csv_reader('http://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=301&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-301.CSV')
  reader.next()  # title
  reader.next()  # headers
  for row in reader:
    if row:
      # Names look like "<name> (<province or territory>)"; 'Canada' is the
      # only accepted name without a parenthesized suffix.
      result = re.search('\A(.+) \((.+)\)\Z', row[1].decode('iso-8859-1'))
      if result:
        name = result.group(1)
        province_or_territory = abbreviations[result.group(2)]
      elif row[1] == 'Canada':
        name = 'Canada'
        province_or_territory = ''
      else:
        raise Exception('Unrecognized name "%s"' % row[1])

      record = {
        'OCD': sgc_code_to_ocdid_map[row[0]],
        'Geographic code': row[0],
        'Geographic name': name,
        'Province or territory': province_or_territory,
        'Geographic type': row[2].decode('iso-8859-1'),
        'Population': row[4],
        'URL': urls.get(row[0], ''),
        'Scraper?': scraper_urls.get(row[0], ''),
        'Contact': '',
        'Request notes': '',
        'Received via': '',
        'Last boundary': '',
        'Next boundary': '',
        'Permission to distribute': '',
        'Highrise URL': '',
        'Type of license': '',
        'License URL': '',
        'Denial notes': '',
      }

      # 'Shapefile?' depends on the municipal subdivisions flag:
      # 'N' -> 'N/A' (together with the request and receipt columns),
      # 'Y' -> 'Request', '?' or no flag -> ''.
      # NOTE(review): a truthy flag other than 'N', 'Y' or '?' leaves
      # 'Shapefile?' unset in the record.
      if municipal_subdivisions.get(row[0]):
        if municipal_subdivisions[row[0]] == 'N':
          for header in ['Shapefile?'] + request_headers + receipt_headers:
            record[header] = 'N/A'
        elif municipal_subdivisions[row[0]] == 'Y':
          record['Shapefile?'] = 'Request'
        elif municipal_subdivisions[row[0]] == '?':
          record['Shapefile?'] = ''
      else:
        record['Shapefile?'] = ''

      records[row[0]] = record
    else:
      # The first empty row ends the data.
      break

  # Merge information from received data.
  for directory, permission_to_distribute in [(base, 'Y'), (private_base, 'N')]:
    # Presumably empties the registry so that each directory is loaded on its
    # own -- confirm against `boundaries` and `registry`.
    boundaries.registry = {}
    for slug, config in registry(directory).items():
      if config.get('metadata'):
        geographic_code = config['metadata'].get('geographic_code')
        if geographic_code:
          license_path = os.path.join(dirname(config['file']), 'LICENSE.txt')
          license_text = ''
          if os.path.exists(license_path):
            with open(license_path) as f:
              license_text = f.read().rstrip('\n').decode('utf-8')

          # NOTE: raises KeyError if the geographic code has no record.
          record = records[geographic_code]
          record['Shapefile?'] = 'Y'
          record['Permission to distribute'] = permission_to_distribute
          record['License URL'] = config.get('licence_url', '')

          if config.get('last_updated'):
            # '%-m' and '%-d' (no zero padding) are platform-specific strftime
            # extensions.
            record['Last boundary'] = config['last_updated'].strftime('%-m/%-d/%Y')

          if config.get('source_url'):
            record['Contact'] = 'N/A'
            record['Received via'] = 'online'
          else:
            # The contact is the first group captured by the "all rights
            # reserved" template, when the license text matches it.
            match = all_rights_reserved_terms_re.search(license_text)
            if match:
              record['Contact'] = match.group(1)
            record['Received via'] = 'email'

          if config.get('licence_url'):
            licence_url = config['licence_url']
            if licence_url in open_data_licenses:
              record['Type of license'] = 'Open'
            elif licence_url in some_rights_reserved_licenses:
              record['Type of license'] = 'Most rights reserved'
            elif licence_url in all_rights_reserved_licenses:
              record['Type of license'] = 'All rights reserved'
            else:
              # An unknown licence URL aborts the run.
              raise Exception(licence_url)
          elif all_rights_reserved_terms_re.search(license_text):
            record['Type of license'] = 'All rights reserved'
          else:
            record['Type of license'] = 'Custom'
        elif not config['metadata'].get('ocd_division'):
          sys.stderr.write('%-60s No geographic_code or ocd_division\n' % slug)

  # Compare the computed records with the published spreadsheet.
  # In each comparison, `a` is the computed value and `b` the spreadsheet value.
  reader = csv.DictReader(StringIO(requests.get('https://docs.google.com/spreadsheet/pub?key=0AtzgYYy0ZABtdGpJdVBrbWtUaEV0THNUd2JIZ1JqM2c&single=true&gid=25&output=csv').content))
  for row in reader:
    geographic_code = row['Geographic code']
    # NOTE: raises KeyError if the spreadsheet has a code without a record.
    record = records[geographic_code]

    for key in row:
      a = record[key]
      b = row[key].decode('utf-8')

      if a != b:
        # Columns that are always tracked manually.
        # Scrapers for municipalities without wards are tracked manually.
        # In-progress requests are tracked manually.
        # Contacts for in-progress requests and private data are tracked manually.
        # We may have a contact to confirm the nonexistence of municipal subdivisions.
        # MFIPPA requests are tracked manually.
        # Additional details about license agreements and written consent are tracked manually.
        # We may have information for a bad shapefile from an in-progress request.
        #
        # When one of these cases applies and the spreadsheet value is not
        # empty, the spreadsheet value replaces the computed one.
        if b and (
           (key in ('Highrise URL', 'Request notes', 'Next boundary', 'Denial notes')) or
           (key == 'Scraper?'         and not a         and record['Shapefile?'] == 'N/A') or
           (key == 'Shapefile?'       and not a         and b in ('Request', 'Requested')) or
           (key == 'Shapefile?'       and a == 'Request'and b == 'Requested') or
           (key == 'Contact'          and not a         and (row['Shapefile?'] == 'Requested' or record['Permission to distribute'] == 'N')) or
           (key == 'Contact'          and a == 'N/A'    and record['Shapefile?'] == 'N/A') or
           (key == 'Received via'     and a == 'email'  and b == 'MFIPPA') or
           (key == 'Type of license'  and '(' in b) or
           (key in ('Received via', 'Type of license', 'Permission to distribute') and not a and row['Shapefile?'] == 'Requested')):
          record[key] = b
        # The spreadsheet can add contacts and URLs.
        # Any other difference is reported, except for 'Population' and for a
        # 'Contact' or 'URL' whose computed value is empty.
        elif key != 'Population' and (key not in ('Contact', 'URL') or a):  # separators
          sys.stderr.write(u'%-25s %s: expected "%s" got "%s"\n' % (key, geographic_code, a, b))

  # Output the header row, then one row per record in insertion order.
  writer = UnicodeWriter(sys.stdout)
  writer.writerow(headers)
  for _, record in records.items():
    writer.writerow([record[header] for header in headers])
Esempio n. 3
0
def definitions(base='.'):
    """
    Check that all definition.py files are valid.

    Downloads the published list of licenses (the non-empty 'License URL'
    values of a Google Sheets CSV, extended with
    ``more_licenses_with_templates``), then walks every ``slug -> config``
    pair returned by ``registry(base)`` and prints one line per problem found,
    with the slug left-justified in 60 columns. The function has no return
    statement; its result is what it prints.

    Checks, in order: the LICENSE.txt file next to the definition,
    unrecognized keys, non-unique values, empty values, missing required keys,
    fields expected to be absent and, for every slug other than
    'Census divisions' and 'Census subdivisions', the keys of ``extra``, the
    uniqueness of the division ID and the values returned by
    ``get_definition``.

    :param base: forwarded unchanged to ``registry``; presumably the directory
        searched for definition files -- confirm against ``registry``.
    """
    # `re._pattern_type` was removed in Python 3.7; deriving the type from a
    # compiled pattern works on every version.
    pattern_type = type(re.compile(''))

    def warn(message, slug):
        """Print `message` for `slug` unless the same message was already printed."""
        if message not in seen:
            print('%-60s %s' % (slug, message))
            seen.add(message)

    def assert_match(slug, field, actual, expected):
        """
        Print a line when `actual` does not satisfy `expected`.

        `expected` may be a compiled pattern (searched in `actual`), a list of
        accepted values (the last one is shown in the message) or a single
        value; `None` accepts anything.
        """
        if isinstance(expected, pattern_type):
            if not expected.search(actual):
                print('%-60s Expected %s to match %s not %s' % (slug, field, expected.pattern, actual))
        elif isinstance(expected, list):
            if actual not in expected:
                print('%-60s Expected %s to be %s not %s' % (slug, field, expected[-1], actual))
        elif actual != expected and expected is not None:
            print('%-60s Expected %s to be %s not %s' % (slug, field, expected, actual))

    def has_multiple_sets(division_id):
        """
        Return whether several definitions may share `division_id`.

        True for country, province and territory divisions and for the
        explicitly listed divisions; False when there is no division ID.
        """
        # A definition without `extra` has no division ID; calling `.rsplit`
        # on None would raise AttributeError.
        if not division_id:
            return False
        ocd_type = division_id.rsplit('/', 1)[-1].split(':')[0]
        return ocd_type in ('country', 'province', 'territory') or division_id in (
            'ocd-division/country:ca/csd:3520005',  # Toronto
        )

    response = requests.get('https://docs.google.com/spreadsheets/d/1AmLQD2KwSpz3B4eStLUPmUQJmOOjRLI3ZUZSD5xUTWM/pub?gid=0&single=true&output=csv')
    response.encoding = 'utf-8'
    reader = csv.DictReader(StringIO(response.text))
    licenses_with_templates = {row['License URL'] for row in reader if row['License URL']}
    licenses_with_templates.update(more_licenses_with_templates)

    seen = set()  # messages already printed by warn()
    division_ids = set()  # division IDs encountered so far
    for slug, config in registry(base).items():
        directory = dirname(config['file'])

        if config.get('extra'):
            division_id = config['extra']['division_id']
        else:
            division_id = None

        # Validate LICENSE.txt.
        license_path = os.path.join(directory, 'LICENSE.txt')
        if os.path.exists(license_path):
            with open(license_path) as f:
                license_text = f.read().rstrip('\n')
            if 'licence_url' in config:
                licence_url = config['licence_url']
                if licence_url in licenses_with_templates:
                    if licence_url not in terms and not terms_re.get(licence_url):
                        warn('No LICENSE.txt template for License URL %s' % licence_url, slug)
                    # `and` binds tighter than `or`: the text differs from the
                    # exact template (with the URL interpolated), or it does
                    # not match the regular-expression template.
                    elif licence_url in terms and license_text != (terms[licence_url] % licence_url) or terms_re.get(licence_url) and not terms_re[licence_url].search(license_text):
                        print('%-60s Expected LICENSE.txt to match license-specific template' % slug)
                elif licence_url in all_rights_reserved_licenses:
                    if not all_rights_reserved_terms_re.search(license_text):
                        print('%-60s Expected LICENSE.txt to match "all rights reserved" template' % slug)
                else:
                    print('%-60s Unrecognized License URL %s' % (slug, licence_url))
            elif not all_rights_reserved_terms_re.search(license_text):
                print('%-60s Expected LICENSE.txt to match "all rights reserved" template' % slug)

        # Check for invalid keys, non-unique or empty values.
        invalid_keys = set(config.keys()) - valid_keys
        if invalid_keys:
            # Sorted so that the output does not depend on set iteration order.
            print('%-60s Unrecognized key: %s' % (slug, ', '.join(sorted(invalid_keys))))
        values = [value for key, value in config.items() if key != 'extra']
        if len(values) > len(set(values)):
            print('%-60s Non-unique values' % slug)
        for key, value in config.items():
            if not value:
                print('%-60s Empty value for %s' % (slug, key))

        # Check for missing required keys.
        for key in ('domain', 'last_updated', 'name_func', 'authority', 'encoding'):
            if key not in config:
                print('%-60s Missing %s' % (slug, key))
        if 'source_url' not in config and 'data_url' in config:
            print('%-60s Missing source_url' % slug)
        if 'source_url' in config and 'licence_url' not in config and 'data_url' not in config:
            print('%-60s Missing licence_url or data_url' % slug)

        # Validate fields.
        if 'name' in config:
            print('%-60s Expected name to be missing' % slug)
        # `singular` is accepted for slugs ending in ')' and for divisions
        # that may have several sets.
        if 'singular' in config and not slug.endswith(')') and not has_multiple_sets(division_id):
            print('%-60s Expected singular to be missing' % slug)

        if slug not in ('Census divisions', 'Census subdivisions'):
            # Check for invalid keys or empty values.
            invalid_keys = set(config['extra'].keys()) - {'division_id'}
            if invalid_keys:
                print('%-60s Unrecognized key: %s' % (slug, ', '.join(sorted(invalid_keys))))
            for key, value in config['extra'].items():
                if not value:
                    print('%-60s Empty value for %s' % (slug, key))

            # Ensure division_id is unique.
            if division_id in division_ids and division_id not in divisions_with_boroughs() and not has_multiple_sets(division_id):
                print('%-60s Duplicate division_id %s' % (slug, division_id))
            else:
                division_ids.add(division_id)

            expected_slug, expected_config = get_definition(division_id, path=config['file'])

            # Check for unexpected values.
            if not slug.endswith(')'):
                assert_match(slug, 'slug', slug, expected_slug)
            for key, value in expected_config.items():
                assert_match(slug, key, config[key], value)
Esempio n. 4
0
def spreadsheet(base='.', private_base='../represent-canada-private-data'):
    """
    Validate the spreadsheet for tracking progress on data collection.

    Builds the expected row for Canada, for each province and territory and
    for each census subdivision of the Statistics Canada 2016 population file,
    merges what the definitions under `base` (permission to distribute 'Y')
    and `private_base` (permission to distribute 'N') establish, and compares
    the result with the published Google spreadsheet.

    Everything is reported on standard error:

    * one line per unexpected difference between an expected and an actual
      value;
    * one tab-separated line per expected division that is absent from the
      spreadsheet, unless its 'Shapefile?' value is 'N/A';
    * one 'Remove ...' line per spreadsheet row that has no expectation.

    :param base: forwarded to `registry` for the distributable definitions.
    :param private_base: forwarded to `registry` for the private definitions.
    """
    expecteds = OrderedDict()

    # Append to `municipal_subdivisions` from `constants.py`.
    for division in Division.all('ca', from_csv=ocd_division_csv):
        if division.attrs['has_children']:
            municipal_subdivisions[type_id(division.id)] = division.attrs['has_children']

    expecteds['ocd-division/country:ca'] = default_expectation.copy()
    expecteds['ocd-division/country:ca'].update({
        'OCD': 'ocd-division/country:ca',
        'Geographic name': 'Canada',
    })

    # Create expectations for provinces and territories.
    for division in Division.all('ca', from_csv=ocd_division_csv):
        if division._type in ('province', 'territory'):
            expected = default_expectation.copy()
            expected.update({
                'OCD': division.id,
                'Geographic name': division.name,
                'Province or territory': type_id(division.id).upper(),
            })

            expecteds[division.id] = expected

    # Create expectations for census subdivisions.
    reader = csv_dict_reader('http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=301&OFT=FULLCSV', 'ISO-8859-1')
    for row in reader:
        code = row['Geographic code']

        # The data rows end where the notes begin.
        if code == 'Note:':
            break

        division = Division.get('ocd-division/country:ca/csd:%s' % code, from_csv=ocd_division_csv)

        expected = default_expectation.copy()
        expected.update({
            'OCD': division.id,
            'Geographic name': division.name,
            'Province or territory': province_or_territory_abbreviation(division.id),
            'Geographic type': division.attrs['classification'],
            'Population': row['Population, 2016'],
        })

        # 'N': nothing to collect; 'Y': boundaries must be requested.
        if code in municipal_subdivisions:
            if municipal_subdivisions[code] == 'N':
                expected['Shapefile?'] = 'N/A'
                expected['Contact'] = 'N/A'
                expected['Received via'] = 'N/A'
                expected['Last boundary'] = 'N/A'
                expected['Permission to distribute'] = 'N/A'
            elif municipal_subdivisions[code] == 'Y':
                expected['Shapefile?'] = 'Request'

        expecteds[division.id] = expected

    # Merge information from received data.
    for directory, permission_to_distribute in [(base, 'Y'), (private_base, 'N')]:
        # Presumably empties the registry so that each directory is loaded on
        # its own -- confirm against `boundaries` and `registry`.
        boundaries.registry = {}
        for slug, config in registry(directory).items():
            if 'extra' in config:
                division_id = config['extra']['division_id']
                if division_id in expecteds:
                    license_path = os.path.join(dirname(config['file']), 'LICENSE.txt')
                    license_text = ''
                    if os.path.exists(license_path):
                        with open(license_path) as f:
                            license_text = f.read().rstrip('\n')

                    expected = expecteds[division_id]
                    expected['Shapefile?'] = 'Y'
                    expected['Permission to distribute'] = permission_to_distribute

                    if 'data_url' in config:
                        expected['Last boundary'] = 'N/A'
                    elif 'last_updated' in config:
                        # '%-m' and '%-d' (no zero padding) are
                        # platform-specific strftime extensions.
                        expected['Last boundary'] = config['last_updated'].strftime('%-m/%-d/%Y')

                    if 'source_url' in config:
                        expected['Contact'] = 'N/A'
                        expected['Received via'] = 'online'
                    elif expected['Province or territory'] == 'SK' and expected['Geographic type'] == 'RM':
                        # NOTE(review): the end of this string ('\[email protected]')
                        # looks like an e-mail obfuscation placeholder rather
                        # than a real address -- confirm the intended value.
                        expected['Contact'] = 'Mikael Nagel\nMapping Technician\nTel: 1-306-569-2988 x205\nToll free: 1-800-663-6864\[email protected]'
                        expected['Received via'] = 'purchase'
                    else:
                        # The contact is the first group captured by the "all
                        # rights reserved" template, when the text matches it.
                        match = all_rights_reserved_terms_re.search(license_text)
                        if match:
                            expected['Contact'] = match.group(1)
                        expected['Received via'] = 'email'
                # The spreadsheet doesn't track borough boundaries.
                elif '/borough:' not in division_id:
                    sys.stderr.write('%-25s no record\n' % division_id)
            else:
                sys.stderr.write('%-25s no extra\n' % slug)

    reader = csv_dict_reader('https://docs.google.com/spreadsheets/d/1ihCIDc9EtvxF7kzPg3Yk6e928DN7JzaycH92IBYr0QU/pub?gid=25&single=true&output=csv', 'utf-8')

    # Columns that are never compared.
    ignored_keys = ('Population', 'URL', 'Request notes', 'Next boundary', 'Response notes')

    actuals = set()
    for actual in reader:
        ocd_id = actual['OCD']
        actuals.add(ocd_id)

        # A row without an expectation is reported as "Remove ..." at the end.
        # Subscripting `expecteds` here would raise KeyError and that report
        # would never be reached.
        expected = expecteds.get(ocd_id)
        if expected is None:
            continue

        for key in actual.keys() - ignored_keys:
            e = expected[key]
            a = actual[key]

            # Note: Some of the following conditions are repetitive for readability.
            if e != a:
                # Change expectations for in-progress requests.
                if expected['Shapefile?'] == 'Request' and actual['Shapefile?'] == 'Requested' and (
                   # Request sent.
                   (key == 'Shapefile?' and e == 'Request' and a == 'Requested') or
                   # Contact found.
                   (key == 'Contact' and not e and a)):
                    continue

                # Some provinces and territories have missing expectations, in which case we defer to the spreadsheet.
                elif actual['Province or territory'] in ('ON', 'MB') and not expected['Shapefile?'] and (
                   # We determined that we needed to request boundaries.
                   (key == 'Shapefile?' and not e and a in ('Request', 'Requested')) or
                   # We determined that we needed to request boundaries, and did so.
                   (actual['Shapefile?'] == 'Requested' and key == 'Contact' and not e and a)):
                    continue

                # Contacts for private data are only stored in the spreadsheet.
                elif ((expected['Permission to distribute'] == 'N' and key == 'Contact' and not e and a) or
                      # MFIPPA receptions are only stored in the spreadsheet.
                      (key == 'Received via' and e == 'email' and a == 'MFIPPA')):
                    continue

                sys.stderr.write('%-25s %s: expected "%s" got "%s"\n' % (key, ocd_id, e, a))

    # Expected divisions that the spreadsheet lacks, as rows ready to be added.
    for division_id in expecteds.keys() - actuals:
        record = expecteds[division_id]
        if record['Shapefile?'] != 'N/A':
            sys.stderr.write('%s\t%s\t%s\t%s\t%s\n' % (division_id, record['Geographic name'], record['Province or territory'], record['Geographic type'], record['Population']))

    # Spreadsheet rows that have no expectation.
    for division_id in actuals - expecteds.keys():
        sys.stderr.write('Remove %s\n' % division_id)
Esempio n. 5
0
def spreadsheet(base='.', private_base='../represent-canada-private-data'):
    """
    Validate the spreadsheet for tracking progress on data collection.

    Builds the expected row for Canada, for each province and territory and
    for each census subdivision of the Statistics Canada 2016 population file,
    merges what the definitions under `base` (permission to distribute 'Y')
    and `private_base` (permission to distribute 'N') establish, and compares
    the result with the published Google spreadsheet.

    Everything is reported on standard error:

    * one line per unexpected difference between an expected and an actual
      value;
    * one tab-separated line per expected division that is absent from the
      spreadsheet, unless its 'Shapefile?' value is 'N/A';
    * one 'Remove ...' line per spreadsheet row that has no expectation.

    :param base: forwarded to `registry` for the distributable definitions.
    :param private_base: forwarded to `registry` for the private definitions.
    """
    expecteds = OrderedDict()

    # Append to `municipal_subdivisions` from `constants.py`.
    for division in Division.all('ca', from_csv=ocd_division_csv):
        if division.attrs['has_children']:
            municipal_subdivisions[type_id(
                division.id)] = division.attrs['has_children']

    expecteds['ocd-division/country:ca'] = default_expectation.copy()
    expecteds['ocd-division/country:ca'].update({
        'OCD': 'ocd-division/country:ca',
        'Geographic name': 'Canada',
    })

    # Create expectations for provinces and territories.
    for division in Division.all('ca', from_csv=ocd_division_csv):
        if division._type in ('province', 'territory'):
            expected = default_expectation.copy()
            expected.update({
                'OCD': division.id,
                'Geographic name': division.name,
                'Province or territory': type_id(division.id).upper(),
            })

            expecteds[division.id] = expected

    # Create expectations for census subdivisions.
    reader = csv_dict_reader(
        'http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=301&OFT=FULLCSV',
        'ISO-8859-1')
    for row in reader:
        code = row['Geographic code']

        # The data rows end where the notes begin.
        if code == 'Note:':
            break

        division = Division.get('ocd-division/country:ca/csd:%s' % code,
                                from_csv=ocd_division_csv)

        expected = default_expectation.copy()
        expected.update({
            'OCD': division.id,
            'Geographic name': division.name,
            'Province or territory':
            province_or_territory_abbreviation(division.id),
            'Geographic type': division.attrs['classification'],
            'Population': row['Population, 2016'],
        })

        # 'N': nothing to collect; 'Y': boundaries must be requested.
        if code in municipal_subdivisions:
            if municipal_subdivisions[code] == 'N':
                expected['Shapefile?'] = 'N/A'
                expected['Contact'] = 'N/A'
                expected['Received via'] = 'N/A'
                expected['Last boundary'] = 'N/A'
                expected['Permission to distribute'] = 'N/A'
            elif municipal_subdivisions[code] == 'Y':
                expected['Shapefile?'] = 'Request'

        expecteds[division.id] = expected

    # Merge information from received data.
    for directory, permission_to_distribute in [(base, 'Y'),
                                                (private_base, 'N')]:
        # Presumably empties the registry so that each directory is loaded on
        # its own -- confirm against `boundaries` and `registry`.
        boundaries.registry = {}
        for slug, config in registry(directory).items():
            if 'extra' in config:
                division_id = config['extra']['division_id']
                if division_id in expecteds:
                    license_path = os.path.join(dirname(config['file']),
                                                'LICENSE.txt')
                    license_text = ''
                    if os.path.exists(license_path):
                        with open(license_path) as f:
                            license_text = f.read().rstrip('\n')

                    expected = expecteds[division_id]
                    expected['Shapefile?'] = 'Y'
                    expected[
                        'Permission to distribute'] = permission_to_distribute

                    if 'data_url' in config:
                        expected['Last boundary'] = 'N/A'
                    elif 'last_updated' in config:
                        # '%-m' and '%-d' (no zero padding) are
                        # platform-specific strftime extensions.
                        expected['Last boundary'] = config[
                            'last_updated'].strftime('%-m/%-d/%Y')

                    if 'source_url' in config:
                        expected['Contact'] = 'N/A'
                        expected['Received via'] = 'online'
                    elif (expected['Province or territory'] == 'SK'
                          and expected['Geographic type'] == 'RM'):
                        # NOTE(review): the end of this string ('\[email protected]')
                        # looks like an e-mail obfuscation placeholder rather
                        # than a real address -- confirm the intended value.
                        expected[
                            'Contact'] = 'Mikael Nagel\nMapping Technician\nTel: 1-306-569-2988 x205\nToll free: 1-800-663-6864\[email protected]'
                        expected['Received via'] = 'purchase'
                    else:
                        # The contact is the first group captured by the "all
                        # rights reserved" template, when the text matches it.
                        match = all_rights_reserved_terms_re.search(
                            license_text)
                        if match:
                            expected['Contact'] = match.group(1)
                        expected['Received via'] = 'email'
                # The spreadsheet doesn't track borough boundaries.
                elif '/borough:' not in division_id:
                    sys.stderr.write('%-25s no record\n' % division_id)
            else:
                sys.stderr.write('%-25s no extra\n' % slug)

    reader = csv_dict_reader(
        'https://docs.google.com/spreadsheets/d/1ihCIDc9EtvxF7kzPg3Yk6e928DN7JzaycH92IBYr0QU/pub?gid=25&single=true&output=csv',
        'utf-8')

    # Columns that are never compared.
    ignored_keys = ('Population', 'URL', 'Request notes', 'Next boundary',
                    'Response notes')

    actuals = set()
    for actual in reader:
        ocd_id = actual['OCD']
        actuals.add(ocd_id)

        # A row without an expectation is reported as "Remove ..." at the end.
        # Subscripting `expecteds` here would raise KeyError and that report
        # would never be reached.
        expected = expecteds.get(ocd_id)
        if expected is None:
            continue

        for key in actual.keys() - ignored_keys:
            e = expected[key]
            a = actual[key]

            # Note: Some of the following conditions are repetitive for readability.
            if e != a:
                # Change expectations for in-progress requests.
                if (expected['Shapefile?'] == 'Request'
                        and actual['Shapefile?'] == 'Requested' and (
                            # Request sent.
                            (key == 'Shapefile?' and e == 'Request'
                             and a == 'Requested') or
                            # Contact found.
                            (key == 'Contact' and not e and a))):
                    continue

                # Some provinces and territories have missing expectations, in which case we defer to the spreadsheet.
                elif (actual['Province or territory'] in ('ON', 'MB')
                      and not expected['Shapefile?'] and (
                          # We determined that we needed to request boundaries.
                          (key == 'Shapefile?' and not e
                           and a in ('Request', 'Requested')) or
                          # We determined that we needed to request boundaries, and did so.
                          (actual['Shapefile?'] == 'Requested'
                           and key == 'Contact' and not e and a))):
                    continue

                # Contacts for private data are only stored in the spreadsheet.
                elif ((expected['Permission to distribute'] == 'N'
                       and key == 'Contact' and not e and a) or
                      # MFIPPA receptions are only stored in the spreadsheet.
                      (key == 'Received via' and e == 'email'
                       and a == 'MFIPPA')):
                    continue

                sys.stderr.write('%-25s %s: expected "%s" got "%s"\n' %
                                 (key, ocd_id, e, a))

    # Expected divisions that the spreadsheet lacks, as rows ready to be added.
    for division_id in expecteds.keys() - actuals:
        record = expecteds[division_id]
        if record['Shapefile?'] != 'N/A':
            sys.stderr.write('%s\t%s\t%s\t%s\t%s\n' %
                             (division_id, record['Geographic name'],
                              record['Province or territory'],
                              record['Geographic type'], record['Population']))

    # Spreadsheet rows that have no expectation.
    for division_id in actuals - expecteds.keys():
        sys.stderr.write('Remove %s\n' % division_id)
Esempio n. 6
0
def definitions(base='.'):
    """
    Check that all definition.py files are valid.

    Walks every definition returned by ``registry(base)`` and prints one line
    per problem found: the slug left-justified in 60 columns, followed by a
    message. Nothing is returned.

    The set of license URLs that are expected to have a LICENSE.txt template
    is read from the 'License URL' column of a published Google spreadsheet
    (fetched over the network on every call) and extended with
    ``more_licenses_with_templates``.

    :param base: passed through to ``registry``.
    """
    # ``re._pattern_type`` was removed in Python 3.7; deriving the class from a
    # compiled pattern works on every Python version.
    pattern_type = type(re.compile(''))

    def warn(message, slug):
        # Print each distinct message only once, for the first slug that
        # triggers it.
        if message not in seen:
            print('%-60s %s' % (slug, message))
            seen.add(message)

    def assert_match(slug, field, actual, expected):
        # ``expected`` may be a compiled regular expression (searched in
        # ``actual``), a list of accepted values (the last one is shown in the
        # message), None (no check) or any other value (compared with ``!=``).
        if isinstance(expected, pattern_type):
            if not expected.search(actual):
                print('%-60s Expected %s to match %s not %s' %
                      (slug, field, expected.pattern, actual))
        elif isinstance(expected, list):
            if actual not in expected:
                print('%-60s Expected %s to be %s not %s' %
                      (slug, field, expected[-1], actual))
        elif actual != expected and expected is not None:
            print('%-60s Expected %s to be %s not %s' %
                  (slug, field, expected, actual))

    def has_multiple_sets(division_id):
        # True when the type of the last path segment of the division ID is
        # country, province or territory, or when the ID is listed explicitly.
        ocd_type = division_id.rsplit('/', 1)[1].split(':')[0]
        return ocd_type in (
            'country', 'province', 'territory') or division_id in (
                'ocd-division/country:ca/csd:3520005',  # Toronto
            )

    # Collect the non-empty license URLs listed in the spreadsheet.
    response = requests.get(
        'https://docs.google.com/spreadsheets/d/1AmLQD2KwSpz3B4eStLUPmUQJmOOjRLI3ZUZSD5xUTWM/pub?gid=0&single=true&output=csv'
    )
    response.encoding = 'utf-8'
    reader = csv.DictReader(StringIO(response.text))
    licenses_with_templates = set(
        filter(None, (row['License URL'] for row in reader)))
    licenses_with_templates.update(more_licenses_with_templates)

    seen = set()  # messages already printed by warn()
    division_ids = set()  # division IDs encountered so far
    for slug, config in registry(base).items():
        directory = dirname(config['file'])

        if config.get('extra'):
            division_id = config['extra']['division_id']
        else:
            division_id = None

        # Validate LICENSE.txt.
        license_path = os.path.join(directory, 'LICENSE.txt')
        if os.path.exists(license_path):
            with open(license_path) as f:
                license_text = f.read().rstrip('\n')
            if 'licence_url' in config:
                licence_url = config['licence_url']
                if licence_url in licenses_with_templates:
                    if licence_url not in terms and not terms_re.get(
                            licence_url):
                        warn(
                            'No LICENSE.txt template for License URL %s' %
                            licence_url, slug)
                    # The exact template is interpolated with the license URL
                    # before comparison; the regex template is searched as is.
                    elif (licence_url in terms and license_text !=
                          (terms[licence_url] % licence_url)) or (
                              terms_re.get(licence_url)
                              and not terms_re[licence_url].search(
                                  license_text)):
                        print(
                            '%-60s Expected LICENSE.txt to match license-specific template'
                            % slug)
                elif licence_url in all_rights_reserved_licenses:
                    if not all_rights_reserved_terms_re.search(license_text):
                        print(
                            '%-60s Expected LICENSE.txt to match "all rights reserved" template'
                            % slug)
                else:
                    print('%-60s Unrecognized License URL %s' %
                          (slug, licence_url))
            # Without a license URL, the "all rights reserved" template is
            # expected.
            elif not all_rights_reserved_terms_re.search(license_text):
                print(
                    '%-60s Expected LICENSE.txt to match "all rights reserved" template'
                    % slug)

        # Check for invalid keys, non-unique or empty values.
        invalid_keys = set(config.keys()) - valid_keys
        if invalid_keys:
            print('%-60s Unrecognized key: %s' %
                  (slug, ', '.join(invalid_keys)))
        # 'extra' is excluded from the uniqueness check.
        values = [value for key, value in config.items() if key != 'extra']
        if len(values) > len(set(values)):
            print('%-60s Non-unique values' % slug)
        for key, value in config.items():
            if not value:
                print('%-60s Empty value for %s' % (slug, key))

        # Check for missing required keys.
        for key in ('domain', 'last_updated', 'name_func', 'authority',
                    'encoding'):
            if key not in config:
                print('%-60s Missing %s' % (slug, key))
        if 'source_url' not in config and 'data_url' in config:
            print('%-60s Missing source_url' % slug)
        if 'source_url' in config and 'licence_url' not in config and 'data_url' not in config:
            print('%-60s Missing licence_url or data_url' % slug)

        # Validate fields.
        if 'name' in config:
            print('%-60s Expected name to be missing' % slug)
        # 'singular' is accepted for slugs ending in ')' and for divisions for
        # which has_multiple_sets() is true.
        if 'singular' in config and not slug.endswith(
                ')') and not has_multiple_sets(division_id):
            print('%-60s Expected singular to be missing' % slug)

        if slug not in ('Census divisions', 'Census subdivisions'):
            # Check for invalid keys or empty values.
            invalid_keys = set(config['extra'].keys()) - {'division_id'}
            if invalid_keys:
                print('%-60s Unrecognized key: %s' %
                      (slug, ', '.join(invalid_keys)))
            for key, value in config['extra'].items():
                if not value:
                    print('%-60s Empty value for %s' % (slug, key))

            # Ensure division_id is unique.
            if division_id in division_ids and division_id not in divisions_with_boroughs(
            ) and not has_multiple_sets(division_id):
                print('%-60s Duplicate division_id %s' % (slug, division_id))
            else:
                division_ids.add(division_id)

            expected_slug, expected_config = get_definition(
                division_id, path=config['file'])

            # Check for unexpected values.
            if not slug.endswith(')'):
                assert_match(slug, 'slug', slug, expected_slug)
            for key, value in expected_config.items():
                assert_match(slug, key, config[key], value)
def spreadsheet(base='.', private_base='../represent-canada-private-data'):
    """
    Write the boundary-tracking spreadsheet to standard output as CSV.

    Records are keyed by geographic code and built in three steps:

    1. A record is created for every province and territory, and for every
       row of the Statistics Canada 2011 population file.
    2. Each definition in ``registry(base)`` and ``registry(private_base)``
       that has a ``geographic_code`` in its metadata updates the matching
       record (shapefile status, license, contact, last boundary date).
    3. The published Google spreadsheet is compared with the records. Some
       manually tracked differences are copied into the records; other
       differences are reported on standard error.

    All source files are downloaded on every call.

    :param base: directory of definitions marked as distributable ('Y').
    :param private_base: directory of definitions marked as not
        distributable ('N').
    :raises Exception: if a name in the population file is neither 'Canada'
        nor of the form 'Name (Province)', or if a definition's licence_url
        is in none of the known license collections.
    """
    sgc_code_to_ocdid_map = sgc_code_to_ocdid()
    ocdid_to_sgc_code_map = {v: k for k, v in sgc_code_to_ocdid_map.items()}
    records = OrderedDict()

    # NOTE(review): municipal_subdivisions is not assigned in this function;
    # presumably a module-level dict that this loop fills in place. Confirm.
    reader = csv_reader(
        'https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_municipal_subdivisions-has_children.csv'
    )
    next(reader)
    for row in reader:
        municipal_subdivisions[row[0].split(':')[-1]] = row[1]

    # Map the last component of each division ID to its URL.
    urls = {}
    reader = csv_reader(
        'https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_census_subdivisions-url.csv'
    )
    next(reader)
    for row in reader:
        urls[row[0].split(':')[-1]] = row[1]

    # Map the third column of each row to the upper-cased last component of
    # its division ID.
    abbreviations = {}
    reader = csv_reader(
        'https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv'
    )
    next(reader)
    for row in reader:
        abbreviations[row[2]] = row[0].split(':')[-1].upper()

    # Create records for provinces and territories.
    reader = csv_reader(
        'https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv'
    )
    next(reader)
    for row in reader:
        records[ocdid_to_sgc_code_map[row[0]]] = {
            'OCD': row[0],
            'Geographic code': ocdid_to_sgc_code_map[row[0]],
            'Geographic name': row[1],
            'Geographic type': '',
            'Province or territory': row[0].split(':')[-1].upper(),
            'Population': '',
            'URL': '',
            'Shapefile?': '',
            'Contact': '',
            'Request notes': '',
            'Received via': '',
            'Last boundary': '',
            'Next boundary': '',
            'Permission to distribute': '',
            'Highrise URL': '',
            'Type of license': '',
            'License URL': '',
            'Denial notes': '',
        }

    # Create records for census subdivisions.
    reader = csv_reader(
        'https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=301&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-301.CSV',
        'ISO-8859-1')
    next(reader)  # title
    next(reader)  # headers
    for row in reader:
        if row:
            # Raw string: \A, \( and \Z are regex escapes, not string escapes.
            result = re.search(r'\A(.+) \((.+)\)\Z', row[1])
            if result:
                name = result.group(1)
                province_or_territory = abbreviations[result.group(2)]
            elif row[1] == 'Canada':
                name = 'Canada'
                province_or_territory = ''
            else:
                raise Exception('Unrecognized name "%s"' % row[1])

            record = {
                'OCD': sgc_code_to_ocdid_map[row[0]],
                'Geographic code': row[0],
                'Geographic name': name,
                'Province or territory': province_or_territory,
                'Geographic type': row[2],
                'Population': row[4],
                'URL': urls.get(row[0], ''),
                'Contact': '',
                'Request notes': '',
                'Received via': '',
                'Last boundary': '',
                'Next boundary': '',
                'Permission to distribute': '',
                'Highrise URL': '',
                'Type of license': '',
                'License URL': '',
                'Denial notes': '',
            }

            # 'N' marks the request and receipt columns as not applicable,
            # 'Y' means boundaries should be requested, and anything else
            # leaves the status blank.
            if municipal_subdivisions.get(row[0]):
                if municipal_subdivisions[row[0]] == 'N':
                    for header in ['Shapefile?'
                                   ] + request_headers + receipt_headers:
                        record[header] = 'N/A'
                elif municipal_subdivisions[row[0]] == 'Y':
                    record['Shapefile?'] = 'Request'
                elif municipal_subdivisions[row[0]] == '?':
                    record['Shapefile?'] = ''
            else:
                record['Shapefile?'] = ''

            records[row[0]] = record
        else:
            # The first empty row ends the data.
            break

    # Merge information from received data.
    for directory, permission_to_distribute in [(base, 'Y'),
                                                (private_base, 'N')]:
        # Reset the registry before loading each directory.
        boundaries.registry = {}
        for slug, config in registry(directory).items():
            if config.get('metadata'):
                geographic_code = config['metadata'].get('geographic_code')
                if geographic_code:
                    license_path = os.path.join(dirname(config['file']),
                                                'LICENSE.txt')
                    license_text = ''
                    if os.path.exists(license_path):
                        with open(license_path) as f:
                            license_text = f.read().rstrip('\n')

                    record = records[geographic_code]
                    record['Shapefile?'] = 'Y'
                    record[
                        'Permission to distribute'] = permission_to_distribute
                    record['License URL'] = config.get('licence_url', '')

                    if config.get('last_updated'):
                        # NOTE(review): '%-m' and '%-d' are not portable
                        # strftime directives; confirm the target platforms.
                        record['Last boundary'] = config[
                            'last_updated'].strftime('%-m/%-d/%Y')

                    # A source URL means the data was obtained online;
                    # otherwise it is recorded as received by email, with the
                    # contact taken from LICENSE.txt when it matches.
                    if config.get('source_url'):
                        record['Contact'] = 'N/A'
                        record['Received via'] = 'online'
                    else:
                        match = all_rights_reserved_terms_re.search(
                            license_text)
                        if match:
                            record['Contact'] = match.group(1)
                        record['Received via'] = 'email'

                    if config.get('licence_url'):
                        licence_url = config['licence_url']
                        if licence_url in open_data_licenses:
                            record['Type of license'] = 'Open'
                        elif licence_url in some_rights_reserved_licenses:
                            record['Type of license'] = 'Most rights reserved'
                        elif licence_url in all_rights_reserved_licenses:
                            record['Type of license'] = 'All rights reserved'
                        else:
                            raise Exception(licence_url)
                    elif all_rights_reserved_terms_re.search(license_text):
                        record['Type of license'] = 'All rights reserved'
                    else:
                        record['Type of license'] = 'Custom'
                elif not config['metadata'].get('ocd_division'):
                    sys.stderr.write(
                        '%-60s No geographic_code or ocd_division\n' % slug)
            else:
                sys.stderr.write('%-60s No metadata\n' % slug)

    # Compare the generated records (a) with the published spreadsheet (b).
    response = requests.get(
        'https://docs.google.com/spreadsheet/pub?key=0AtzgYYy0ZABtdGpJdVBrbWtUaEV0THNUd2JIZ1JqM2c&single=true&gid=25&output=csv'
    )
    response.encoding = 'utf-8'
    reader = csv.DictReader(StringIO(response.text))
    for row in reader:
        geographic_code = row['Geographic code']
        record = records[geographic_code]

        for key in row:
            a = record[key]
            b = row[key]

            if a != b:
                # Columns that are always tracked manually.
                # Scrapers for municipalities without wards are tracked manually.
                # In-progress requests are tracked manually.
                # Contacts for in-progress requests and private data are tracked manually.
                # We may have a contact to confirm the nonexistence of municipal subdivisions.
                # MFIPPA requests are tracked manually.
                # Additional details about license agreements and written consent are tracked manually.
                # We may have information for a bad shapefile from an in-progress request.
                if b and ((key in ('Highrise URL', 'Request notes',
                                   'Next boundary', 'Denial notes')) or
                          (key == 'Shapefile?' and not a
                           and b in ('Request', 'Requested')) or
                          (key == 'Shapefile?' and a == 'Request'
                           and b == 'Requested') or
                          (key == 'Contact' and not a and
                           (row['Shapefile?'] == 'Requested'
                            or record['Permission to distribute'] == 'N')) or
                          (key == 'Contact' and a == 'N/A'
                           and record['Shapefile?'] == 'N/A') or
                          (key == 'Received via' and a == 'email'
                           and b == 'MFIPPA') or
                          (key == 'Type of license' and '(' in b) or
                          (key in ('Received via', 'Type of license',
                                   'Permission to distribute') and not a
                           and row['Shapefile?'] == 'Requested')):
                    record[key] = b
                # The spreadsheet can add contacts and URLs.
                elif key != 'Population' and (key not in ('Contact', 'URL')
                                              or a):  # separators
                    sys.stderr.write('%-25s %s: expected "%s" got "%s"\n' %
                                     (key, geographic_code, a, b))

    # Emit the merged records in insertion order.
    writer = csv.writer(sys.stdout)
    writer.writerow(headers)
    for record in records.values():
        writer.writerow([record[header] for header in headers])
def definitions(base='.'):
    """
    Validate every definition.py file found through ``registry(base)``.

    One line is printed per problem: the slug left-justified in 50 columns,
    followed by a message. Nothing is returned.
    """
    # A repeated division_id is not reported for these divisions.
    boroughs = (
        'ocd-division/country:ca/csd:2458227',  # Longueuil
        'ocd-division/country:ca/csd:2466023',  # Montréal
        'ocd-division/country:ca/csd:2423027',  # Québec
        'ocd-division/country:ca/csd:2494068',  # Saguenay
        'ocd-division/country:ca/csd:2443027',  # Sherbrooke
    )
    all_rights_reserved_mismatch = (
        '%-50s Expected LICENSE.txt to match "all rights reserved" template')

    reported = set()  # messages already printed by warn()
    known_division_ids = set()

    def warn(message, slug):
        # Each distinct message is printed a single time.
        if message in reported:
            return
        print('%-50s %s' % (slug, message))
        reported.add(message)

    for slug, config in registry(base).items():
        # Validate LICENSE.txt.
        license_path = os.path.join(dirname(config['file']), 'LICENSE.txt')
        if os.path.exists(license_path):
            with open(license_path) as f:
                license_text = f.read().rstrip('\n')

            licence_url = config.get('licence_url')
            if not licence_url:
                # No license URL: the "all rights reserved" template applies.
                if not all_rights_reserved_terms_re.search(license_text):
                    print(all_rights_reserved_mismatch % slug)
            elif (licence_url in open_data_licenses
                  or licence_url in some_rights_reserved_licenses):
                exact_template = terms.get(licence_url)
                regex_template = terms_re.get(licence_url)
                if not exact_template and not regex_template:
                    warn(
                        'No LICENSE.txt template for License URL %s' %
                        licence_url, slug)
                elif (exact_template and license_text != exact_template) or (
                        regex_template
                        and not regex_template.search(license_text)):
                    print(
                        '%-50s Expected LICENSE.txt to match license-specific template'
                        % slug)
            elif licence_url in all_rights_reserved_licenses:
                if not all_rights_reserved_terms_re.search(license_text):
                    print(all_rights_reserved_mismatch % slug)
            else:
                print('%-50s Unrecognized License URL %s' %
                      (slug, licence_url))

        # Check for invalid keys, non-unique or empty values.
        unknown_keys = set(config.keys()) - valid_keys
        if unknown_keys:
            print('%-50s Unrecognized key: %s' %
                  (slug, ', '.join(unknown_keys)))
        comparable = [v for k, v in config.items() if k != 'metadata']
        if len(set(comparable)) < len(comparable):
            print('%-50s Non-unique values' % slug)
        for name, value in config.items():
            if not value:
                print('%-50s Empty value for %s' % (slug, name))

        # Check for missing required keys.
        for required in ('domain', 'last_updated', 'name_func', 'authority',
                         'encoding'):
            if not config.get(required):
                print('%-50s Missing %s' % (slug, required))
        source_url = config.get('source_url')
        data_url = config.get('data_url')
        if data_url and not source_url:
            print('%-50s Missing source_url' % slug)
        if source_url and not data_url and not config.get('licence_url'):
            print('%-50s Missing licence_url or data_url' % slug)

        # Validate fields.
        for unwanted in ('name', 'singular'):
            if config.get(unwanted):
                print('%-50s Expected %s to be missing' % (slug, unwanted))

        # The remaining checks do not apply to these two slugs.
        if slug in ('Census divisions', 'Census subdivisions'):
            continue

        # Check for invalid keys or empty values.
        metadata = config['metadata']
        unknown_keys = set(metadata.keys()) - valid_metadata_keys
        if unknown_keys:
            print('%-50s Unrecognized key: %s' %
                  (slug, ', '.join(unknown_keys)))
        for name, value in metadata.items():
            if not value:
                print('%-50s Empty value for %s' % (slug, name))

        division_id = get_division_id(slug, config)

        # Ensure division_id is unique.
        if division_id in known_division_ids and division_id not in boroughs:
            print('%-50s Duplicate division_id %s' % (slug, division_id))
        else:
            known_division_ids.add(division_id)

        # Check for unexpected values.
        expected_slug, expected_config = get_definition(division_id)
        assert_match(slug, 'slug', slug, expected_slug)
        for name, expected in expected_config.items():
            assert_match(slug, name, config[name], expected)
# Example no. 9
# 0
def spreadsheet(base='.', private_base='../represent-canada-private-data'):
    """
    Write the boundary-tracking spreadsheet to standard output as CSV.

    Records are keyed by OCD division ID and built in three steps:

    1. A record is created for every province and territory, and for every
       row of the Statistics Canada 2011 population file.
    2. Each definition in ``registry(base)`` and ``registry(private_base)``
       whose ``extra['division_id']`` has a record updates that record
       (shapefile status, permission, contact, last boundary date).
    3. The published Google spreadsheet is compared with the records. Some
       manually tracked differences are copied into the records; other
       differences are reported on standard error.

    All source files are downloaded on every call.

    :param base: directory of definitions marked as distributable ('Y').
    :param private_base: directory of definitions marked as not
        distributable ('N').
    :raises Exception: if a name in the population file is neither 'Canada'
        nor of the form 'Name (Province)'.
    """
    sgc_code_to_ocdid_map = sgc_code_to_ocdid()
    records = OrderedDict()

    # NOTE(review): municipal_subdivisions is not assigned in this function;
    # presumably a module-level dict that this loop fills in place. Confirm.
    reader = csv_reader(
        'https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_municipal_subdivisions-has_children.csv'
    )
    next(reader)
    for row in reader:
        municipal_subdivisions[row[0].split(':')[-1]] = row[1]

    # Map the last component of each division ID to its URL.
    urls = {}
    reader = csv_reader(
        'https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_census_subdivisions-url.csv'
    )
    next(reader)
    for row in reader:
        urls[row[0].split(':')[-1]] = row[1]

    # Map the third column of each row to the upper-cased last component of
    # its division ID.
    abbreviations = {}
    reader = csv_reader(
        'https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv'
    )
    next(reader)
    for row in reader:
        abbreviations[row[2]] = row[0].split(':')[-1].upper()

    # Create records for provinces and territories.
    reader = csv_reader(
        'https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv'
    )
    next(reader)
    for row in reader:
        records[row[0]] = {
            'OCD': row[0],
            'Geographic name': row[1],
            'Geographic type': '',
            'Province or territory': row[0].split(':')[-1].upper(),
            'Population': '',
            'URL': '',
            'Shapefile?': '',
            'Contact': '',
            'Request notes': '',
            'Received via': '',
            'Last boundary': '',
            'Next boundary': '',
            'Permission to distribute': '',
            'Highrise URL': '',
            'Response notes': '',
        }

    # Create records for census subdivisions.
    reader = csv_reader(
        'http://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=301&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-301.CSV',
        'ISO-8859-1')
    next(reader)  # title
    next(reader)  # headers
    for row in reader:
        if row:
            # Raw string: \A, \( and \Z are regex escapes, not string escapes.
            result = re.search(r'\A(.+) \((.+)\)\Z', row[1])
            if result:
                name = result.group(1)
                province_or_territory = abbreviations[result.group(2)]
            elif row[1] == 'Canada':
                name = 'Canada'
                province_or_territory = ''
            else:
                raise Exception('Unrecognized name "%s"' % row[1])

            division_id = sgc_code_to_ocdid_map[row[0]]

            record = {
                'OCD': division_id,
                'Geographic name': name,
                'Province or territory': province_or_territory,
                'Geographic type': row[2],
                'Population': row[4],
                'URL': urls.get(row[0], ''),
                'Contact': '',
                'Request notes': '',
                'Received via': '',
                'Last boundary': '',
                'Next boundary': '',
                'Permission to distribute': '',
                'Highrise URL': '',
                'Response notes': '',
            }

            # 'N' marks the request and receipt columns as not applicable,
            # 'Y' means boundaries should be requested, and anything else
            # leaves the status blank.
            if row[0] in municipal_subdivisions:
                if municipal_subdivisions[row[0]] == 'N':
                    for header in ['Shapefile?'
                                   ] + request_headers + receipt_headers:
                        record[header] = 'N/A'
                elif municipal_subdivisions[row[0]] == 'Y':
                    record['Shapefile?'] = 'Request'
                elif municipal_subdivisions[row[0]] == '?':
                    record['Shapefile?'] = ''
            else:
                record['Shapefile?'] = ''

            records[division_id] = record
        else:
            # The first empty row ends the data.
            break

    # Merge information from received data.
    for directory, permission_to_distribute in [(base, 'Y'),
                                                (private_base, 'N')]:
        # Reset the registry before loading each directory.
        boundaries.registry = {}
        for slug, config in registry(directory).items():
            if 'extra' in config:
                division_id = config['extra']['division_id']
                if division_id in records:
                    license_path = os.path.join(dirname(config['file']),
                                                'LICENSE.txt')
                    license_text = ''
                    if os.path.exists(license_path):
                        with open(license_path) as f:
                            license_text = f.read().rstrip('\n')

                    record = records[division_id]
                    record['Shapefile?'] = 'Y'
                    record[
                        'Permission to distribute'] = permission_to_distribute

                    if 'last_updated' in config:
                        # NOTE(review): '%-m' and '%-d' are not portable
                        # strftime directives; confirm the target platforms.
                        record['Last boundary'] = config[
                            'last_updated'].strftime('%-m/%-d/%Y')

                    # A source URL means the data was obtained online;
                    # otherwise it is recorded as received by email, with the
                    # contact taken from LICENSE.txt when it matches.
                    if 'source_url' in config:
                        record['Contact'] = 'N/A'
                        record['Received via'] = 'online'
                    else:
                        match = all_rights_reserved_terms_re.search(
                            license_text)
                        if match:
                            record['Contact'] = match.group(1)
                        record['Received via'] = 'email'
                # The spreadsheet doesn't track borough boundaries.
                elif '/borough:' not in division_id:
                    sys.stderr.write('%-25s no record\n' % division_id)
            else:
                sys.stderr.write('%-25s no extra\n' % slug)

    # Compare the generated records (a) with the published spreadsheet (b).
    response = requests.get(
        'https://docs.google.com/spreadsheets/d/1ihCIDc9EtvxF7kzPg3Yk6e928DN7JzaycH92IBYr0QU/pub?gid=25&single=true&output=csv'
    )
    response.encoding = 'utf-8'
    reader = csv.DictReader(StringIO(response.text))
    for row in reader:
        record = records[row['OCD']]

        for key in row:
            a = record[key]
            b = row[key]

            if a != b:
                if b and (
                        # Columns that are always tracked manually.
                    (key in ('Highrise URL', 'Request notes', 'Next boundary',
                             'Response notes')) or
                        # Scrapers for municipalities without wards are tracked manually.
                    (key == 'Shapefile?' and not a
                     and b in ('Request', 'Requested')) or
                        # In-progress requests are tracked manually.
                    (key == 'Shapefile?' and a == 'Request'
                     and b == 'Requested') or
                        # Contacts for in-progress requests and private data are tracked manually.
                    (key == 'Contact' and not a and
                     (row['Shapefile?'] == 'Requested'
                      or record['Permission to distribute'] == 'N')) or
                        # We may have a contact to confirm the nonexistence of municipal subdivisions.
                    (key == 'Contact' and a == 'N/A'
                     and record['Shapefile?'] == 'N/A') or
                        # MFIPPA requests are tracked manually.
                    (key == 'Received via' and a == 'email' and b == 'MFIPPA')
                        or
                        # We may have information for a bad shapefile from an in-progress request.
                    (key in ('Received via', 'Permission to distribute')
                     and not a and row['Shapefile?'] == 'Requested')):
                    record[key] = b
                # The spreadsheet can add contacts and URLs.
                elif key != 'Population' and (key not in ('Contact', 'URL')
                                              or a):  # separators
                    sys.stderr.write('%-25s %s: expected "%s" got "%s"\n' %
                                     (key, row['OCD'], a, b))

    # Emit the merged records in insertion order.
    writer = csv.writer(sys.stdout)
    writer.writerow(headers)
    for record in records.values():
        writer.writerow([record[header] for header in headers])