def gen_blocks(self, feature):
    """Yield one dict of block fields per feature name attached to this edge.

    A single TIGER edge (keyed by TLID) may carry several names; a
    separate block dict is yielded for each one, all sharing the same
    address-range / zip / city / state fields.  Every TLID that yields
    at least one block is recorded in ``self.tlids_with_blocks``.
    """
    block_fields = {}
    tlid = feature.get('TLID')
    # From/to address ranges for both sides of the street.
    address_keys = (
        ('right_from_num', 'RFROMADD'),
        ('left_from_num', 'LFROMADD'),
        ('right_to_num', 'RTOADD'),
        ('left_to_num', 'LTOADD'),
    )
    for out_key, tiger_key in address_keys:
        block_fields[out_key] = feature.get(tiger_key)
    block_fields['right_zip'] = feature.get('ZIPR')
    block_fields['left_zip'] = feature.get('ZIPL')
    # City and state may differ on each side of the edge.
    for side in ('right', 'left'):
        abbr = side[0].upper()
        block_fields[side + '_city'] = self._get_city(feature, abbr).upper()
        block_fields[side + '_state'] = self._get_state(feature, abbr).upper()
    if tlid in self.featnames_db:
        standardize_suffix = geocoder_parsing.STANDARDIZERS['suffix']
        suffix_re = geocoder_parsing.TOKEN_REGEXES['suffix']
        for name_row in self.featnames_db[tlid]:
            # Prefix eg. 'STATE HWY'.
            block_fields['prefix'] = name_row.get('PRETYPABRV', '').upper().strip()
            # Main part of the name, eg. 'MAIN'.
            block_fields['street'] = name_row['NAME'].upper().strip()
            # Prefix direction eg. 'N'.
            block_fields['predir'] = name_row['PREDIRABRV'].upper().strip()
            # Suffix direction eg. 'SW'.
            block_fields['postdir'] = name_row['SUFDIRABRV'].upper().strip()
            # Road type, eg. 'ST', 'AVE', 'PKWY'.
            block_fields['suffix'] = name_row['SUFTYPABRV'].upper().strip()
            if not block_fields['suffix'] and ' ' in block_fields['street']:
                # Bug in the data: many streets named eg. 'Wilson Park'
                # put the whole thing in the name and nothing in the
                # suffix, which breaks the geocoder (it parses 'Park' as
                # the suffix and expects to find it in that field).  If
                # the name's last word is a recognized road type, move
                # it into the suffix field.
                words = block_fields['street'].split()
                trailing = words.pop()
                remainder = ' '.join(words)
                if suffix_re.match(trailing):
                    block_fields['suffix'] = standardize_suffix(trailing)
                    block_fields['street'] = remainder
            if not block_fields['prefix']:
                # More bugs in data: some auxiliary roads have the
                # prefix as part of the name in nonstandard format.
                road_prefix = road_name = None
                street_val = block_fields['street']
                if street_val.startswith('INTERSTATE '):
                    road_prefix, road_name = street_val.split(' ', 1)
                elif street_val.startswith('I-'):
                    road_prefix, road_name = street_val.split('-', 1)
                if road_prefix and road_name:
                    logger.debug("Splitting prefix %r out of street %r"
                                 % (road_prefix, road_name))
                    block_fields['street'] = road_name.strip()
                    block_fields['prefix'] = road_prefix.strip()
            # Yield a copy so later iterations don't mutate earlier results.
            yield block_fields.copy()
            self.tlids_with_blocks.add(tlid)
def skip_feature(self, feature):
    """Return True if *feature* should not be imported as a block.

    An edge is skipped when it falls outside the configured bounds,
    carries a feature-class code we don't handle, lies outside the
    filter city (when one is set), or lacks a complete from/to address
    range on both sides.
    """
    # Geometric filter: drop edges entirely outside the working bounds.
    if self.filter_bounds and not feature.geom.intersects(self.filter_bounds):
        logger.debug("Skipping %s, out of bounds" % feature)
        return True

    # MTFCC is the Census feature-class code; only codes in
    # VALID_MTFCC are importable.
    mtfcc = feature.get('MTFCC')
    if mtfcc not in VALID_MTFCC:
        logger.debug("Skipping %s, not a valid feature type" % mtfcc)
        return True

    if self.filter_city:
        # Look at both sides of the edge; a match on either side keeps it.
        side_cities = [self._get_city(feature, abbr).upper()
                       for abbr in ('R', 'L')]
        if self.filter_city not in side_cities:
            logger.debug("Skipping %s, out of city" % feature)
            return True

    # Need a full from/to range on at least one side to build a block.
    has_right_range = feature.get('RFROMADD') and feature.get('RTOADD')
    has_left_range = feature.get('LFROMADD') and feature.get('LTOADD')
    if not (has_right_range or has_left_range):
        logger.debug("Skipping %s, not enough address info" % feature)
        return True

    return False
def skip_feature(self, feature):
    """Decide whether a TIGER edge feature should be excluded from import.

    Returns True when the feature is outside the configured bounds, is
    not a feature class we handle, lies outside the filter city, or has
    no usable address-range data; False otherwise.
    """
    # Geometric filter: drop edges entirely outside the working bounds.
    if self.filter_bounds:
        if not feature.geom.intersects(self.filter_bounds):
            logger.debug("Skipping %s, out of bounds" % feature)
            return True
    # MTFCC is the Census feature-class code; only codes listed in
    # VALID_MTFCC are importable.
    if not feature.get('MTFCC') in VALID_MTFCC:
        logger.debug("Skipping %s, not a valid feature type"
                     % feature.get('MTFCC'))
        return True
    # City filter: keep the edge if EITHER side (right or left) matches.
    if self.filter_city:
        in_city = False
        for side in ('R', 'L'):
            if self._get_city(feature, side).upper() == self.filter_city:
                in_city = True
        if not in_city:
            logger.debug("Skipping %s, out of city" % feature)
            return True
    # Need a complete from/to address range on at least one side of the
    # street to be able to build a block.
    if not (
        ((feature.get('RFROMADD') and feature.get('RTOADD'))
         or (feature.get('LFROMADD') and feature.get('LTOADD')))):
        logger.debug("Skipping %s, not enough address info" % feature)
        return True
    return False
def _clean_featnames(self, featnames_dbf):
    """Build a mapping of TLID -> list of primary feature-name rows.

    Rows with an MTFCC outside VALID_MTFCC or with an empty FULLNAME
    are dropped.  Only primary names (PAFLAG == 'P') are kept in the
    result; alternate names are merely logged, since we have no way to
    store aliases yet.

    Returns a defaultdict(list) keyed by TLID.
    """
    rel_db = self._load_rel_db(featnames_dbf, 'TLID')
    featnames_db = defaultdict(list)
    for tlid, rows in rel_db.iteritems():
        primary = None
        alternates = []
        for row in rows:
            # TLID is Tiger/Line ID, unique per edge.
            # We use TLID instead of LINEARID as the key because
            # LINEARID is only unique per 'linear feature', which is
            # an implicit union of some edges. So if we used LINEARID,
            # we'd clobber a lot of keys in the call to
            # _load_rel_db().
            # Fixes #14 ("missing blocks").
            if row['MTFCC'] not in VALID_MTFCC:
                continue
            if not row.get('FULLNAME'):
                self.log("skipping tlid %r, no fullname" % tlid)
                continue
            if row['PAFLAG'] == 'P':
                primary = row
                featnames_db[tlid].append(row)
            else:
                alternates.append(row)
        if primary is None:
            # BUGFIX: if every surviving row for this TLID was an
            # alternate, there is no primary to compare against and the
            # code below used to crash with a TypeError on
            # primary['NAME'].  Nothing useful to record; move on.
            continue
        # A lot of alternates seem to be duplicates of the primary name,
        # not useful.
        alternates = [row for row in alternates
                      if row['NAME'].upper() != primary['NAME'].upper()]
        # For now we just log alternates that were found. Ideally we could
        # save these as aliases somehow, but at the moment we don't have a
        # good way to do that.
        for alternate in alternates:
            correct = primary['NAME'].upper()
            incorrect = alternate['NAME'].upper()
            msg = 'Found alternate name for {0} ({1}): {2}\n{3}\n{4}'
            logger.debug(msg.format(correct, primary['TLID'], incorrect,
                                    pprint.pformat(primary),
                                    pprint.pformat(alternate)))
    return featnames_db
def _get_city(self, feature, side):
    """Return the city name for one side ('R' or 'L') of an edge.

    When ``self.fix_cities`` is set, the city is taken from an
    intersecting city Location in the database instead of the TIGER
    faces/places attributes.  Returns '' when no city can be determined.
    """
    if self.fix_cities:
        from ebpub.db.models import get_city_locations
        candidates = list(get_city_locations().filter(
            location__intersects=feature.geom.geos))
        if not candidates:
            return ''
        city = candidates[0].name
        logger.debug("overriding city to %s" % city)
        return city
    face_id = feature.get('TFID' + side)
    if face_id not in self.faces_db:
        return ''
    face = self.faces_db[face_id]
    # Handle both 2010 and older census files.
    # If none of these work, we simply get no city.
    place_id = (face.get('PLACEFP10') or face.get('PLACEFP00')
                or face.get('PLACEFP'))
    if place_id not in self.places:
        return ''
    place = self.places[place_id]
    # Handle both 2010 and earlier Census files.
    return place.get('NAME10') or place['NAME']
def _clean_featnames(self, featnames_dbf):
    """Build a mapping of TLID -> list of primary feature-name rows.

    Rows with an MTFCC outside VALID_MTFCC or an empty FULLNAME are
    dropped.  Only primary names (PAFLAG == 'P') land in the result;
    alternates are only logged because we have no alias storage yet.

    Returns a defaultdict(list) keyed by TLID.
    """
    rel_db = self._load_rel_db(featnames_dbf, 'TLID')
    featnames_db = defaultdict(list)
    for tlid, rows in rel_db.iteritems():
        primary = None
        alternates = []
        for row in rows:
            # TLID is Tiger/Line ID, unique per edge.
            # We use TLID instead of LINEARID as the key because
            # LINEARID is only unique per 'linear feature', which is
            # an implicit union of some edges. So if we used LINEARID,
            # we'd clobber a lot of keys in the call to
            # _load_rel_db().
            # Fixes #14 ("missing blocks").
            if row['MTFCC'] not in VALID_MTFCC:
                continue
            if not row.get('FULLNAME'):
                self.log("skipping tlid %r, no fullname" % tlid)
                continue
            if row['PAFLAG'] == 'P':
                primary = row
                featnames_db[tlid].append(row)
            else:
                alternates.append(row)
        # BUGFIX: only compare/log alternates when a primary name
        # survived filtering; previously ``primary['NAME']`` crashed
        # with a TypeError when every surviving row was an alternate.
        if primary is not None:
            # A lot of alternates seem to be duplicates of the primary
            # name, not useful.
            alternates = [row for row in alternates
                          if row['NAME'].upper() != primary['NAME'].upper()]
            # For now we just log alternates that were found.  Ideally we
            # could save these as aliases somehow, but at the moment we
            # don't have a good way to do that.
            for alternate in alternates:
                correct = primary['NAME'].upper()
                incorrect = alternate['NAME'].upper()
                msg = 'Found alternate name for {0} ({1}): {2}\n{3}\n{4}'
                logger.debug(msg.format(correct, primary['TLID'], incorrect,
                                        pprint.pformat(primary),
                                        pprint.pformat(alternate)))
    return featnames_db
def _get_city(self, feature, side):
    """Return the city name for one side ('R' or 'L') of an edge.

    When ``self.fix_cities`` is set, the city comes from an intersecting
    city Location in the database; otherwise it is looked up via the
    TIGER faces -> places attribute chain.  Returns '' if no city can
    be determined.
    """
    city = ''
    if self.fix_cities:
        # Local import: ebpub models may not be importable at module
        # load time in all deployments.
        from ebpub.db.models import get_city_locations
        overlapping_cities = list(get_city_locations().filter(
            location__intersects=feature.geom.geos))
        if overlapping_cities:
            city = overlapping_cities[0].name
            logger.debug("overriding city to %s" % city)
    else:
        # TFIDR / TFIDL gives the face record on the requested side.
        fid = feature.get('TFID' + side)
        if fid in self.faces_db:
            face = self.faces_db[fid]
            # Handle both 2010 and older census files.
            # If none of these work, we simply get no city.
            pid = face.get('PLACEFP10') or face.get(
                'PLACEFP00') or face.get('PLACEFP')
            if pid in self.places:
                place = self.places[pid]
                # Handle both 2010 and earlier Census files.
                city = place.get('NAME10') or place['NAME']
    return city
def main(argv=None):
    """Command-line entry point: import TIGER/Line shapefiles as blocks.

    Expects four positional arguments: edges.shp featnames.dbf
    faces.dbf place.shp.  Returns the result of ``parser.error()`` on
    bad usage, otherwise None after logging an import summary.
    """
    if argv is None:
        argv = sys.argv[1:]
    parser = optparse.OptionParser(
        usage='%prog edges.shp featnames.dbf faces.dbf place.shp')
    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False)
    parser.add_option('-c', '--city', dest='city',
                      help='A city name to filter against')
    parser.add_option(
        '-f', '--fix-cities', action="store_true", default=False,
        help='Whether to override "city" attribute of blocks and '
        'streets by finding an intersecting Location of a city-ish '
        'type. Only makes sense if you have configured '
        'multiple_cities=True in the METRO_LIST of your settings.py, '
        'and after you have created some appropriate Locations.')
    parser.add_option(
        '-b', '--filter-bounds', action="store", default=1, type='int',
        help='Whether to skip blocks outside the metro extent from your '
        'settings.py. Default 1 (true); use 0 to disable.')
    parser.add_option(
        '-l', '--filter-location', action="append",
        help='A location (spelled as location-type-slug:location-slug) '
        'that will be used to filter out blocks outside its boundaries. '
        'May be passed more than once.')
    parser.add_option('-e', '--encoding', dest='encoding',
                      help='Encoding to use when reading the shapefile',
                      default='utf8')
    (options, args) = parser.parse_args(argv)

    if len(args) != 4:
        return parser.error('must provide 4 arguments, see usage')

    # BUGFIX: raise the log level *before* constructing TigerImporter so
    # debug output emitted during its construction is not lost.
    # (Previously this was done only after the importer was built.)
    if options.verbose:
        import logging
        logger.setLevel(logging.DEBUG)

    if options.filter_bounds:
        from ebpub.utils.geodjango import get_default_bounds
        filter_bounds = get_default_bounds()
    else:
        filter_bounds = None

    # Optionally filter on bounds of some Locations too.
    loc_bounds = None
    for locslug in options.filter_location or []:
        typeslug, locslug = locslug.split(':', 1)
        from ebpub.db.models import Location
        location = Location.objects.get(location_type__slug=typeslug,
                                        slug=locslug)
        if loc_bounds is None:
            loc_bounds = location.location
        else:
            loc_bounds = loc_bounds.union(location.location)

    # Combine the metro bounds with the location bounds when both are
    # set; otherwise use whichever one exists.  (A dead no-op
    # "filter_bounds = filter_bounds" else-branch was removed here.)
    if None not in (filter_bounds, loc_bounds):
        filter_bounds = filter_bounds.intersection(loc_bounds)
    elif loc_bounds is not None:
        filter_bounds = loc_bounds

    tiger = TigerImporter(*args,
                          verbose=options.verbose,
                          filter_city=options.city,
                          filter_bounds=filter_bounds,
                          encoding=options.encoding,
                          fix_cities=options.fix_cities)
    num_created = tiger.save()
    logger.info("Created %d new blocks" % num_created)

    # Diagnostics: summarize feature names that produced no blocks.
    logger.debug("... from %d feature names" % len(tiger.featnames_db))
    logger.debug("feature tlids with blocks: %d"
                 % len(tiger.tlids_with_blocks))
    import pprint
    tlids_wo_blocks = set(tiger.featnames_db.keys()).difference(
        tiger.tlids_with_blocks)
    logger.debug("feature tlids WITHOUT blocks: %d" % len(tlids_wo_blocks))
    all_rows = []
    for t in tlids_wo_blocks:
        all_rows.extend(tiger.featnames_db[t])
    logger.debug("Rows: %d" % len(all_rows))
    names = [(r['FULLNAME'], r['TLID']) for r in all_rows]
    names.sort()
    logger.debug("=================")
    for n, t in names:
        logger.debug("%s %s" % (n, t))
    for tlid in sorted(tlids_wo_blocks)[:10]:
        feat = tiger.featnames_db[tlid]
        logger.debug(pprint.pformat(feat))
def main(argv=None):
    """Command-line entry point: import TIGER/Line shapefiles as blocks.

    Expects four positional arguments: edges.shp featnames.dbf
    faces.dbf place.shp.
    """
    if argv is None:
        argv = sys.argv[1:]
    parser = optparse.OptionParser(
        usage='%prog edges.shp featnames.dbf faces.dbf place.shp')
    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False)
    parser.add_option('-c', '--city', dest='city',
                      help='A city name to filter against')
    parser.add_option(
        '-f', '--fix-cities', action="store_true", default=False,
        help='Whether to override "city" attribute of blocks and '
        'streets by finding an intersecting Location of a city-ish '
        'type. Only makes sense if you have configured '
        'multiple_cities=True in the METRO_LIST of your settings.py, '
        'and after you have created some appropriate Locations.')
    parser.add_option(
        '-b', '--filter-bounds', action="store", default=1, type='int',
        help='Whether to skip blocks outside the metro extent from your '
        'settings.py. Default 1 (true); use 0 to disable.')
    parser.add_option(
        '-l', '--filter-location', action="append",
        help='A location (spelled as location-type-slug:location-slug) '
        'that will be used to filter out blocks outside its boundaries. '
        'May be passed more than once.')
    parser.add_option('-e', '--encoding', dest='encoding',
                      help='Encoding to use when reading the shapefile',
                      default='utf8')
    (options, args) = parser.parse_args(argv)
    if len(args) != 4:
        return parser.error('must provide 4 arguments, see usage')
    if options.filter_bounds:
        from ebpub.utils.geodjango import get_default_bounds
        filter_bounds = get_default_bounds()
    else:
        filter_bounds = None
    # Optionally filter on bounds of some Locations too.
    loc_bounds = None
    for locslug in options.filter_location or []:
        typeslug, locslug = locslug.split(':', 1)
        from ebpub.db.models import Location
        location = Location.objects.get(location_type__slug=typeslug,
                                        slug=locslug)
        if loc_bounds is None:
            loc_bounds = location.location
        else:
            loc_bounds = loc_bounds.union(location.location)
    # Intersect the metro bounds with the location bounds if both exist.
    if None not in (filter_bounds, loc_bounds):
        filter_bounds = filter_bounds.intersection(loc_bounds)
    elif loc_bounds is not None:
        filter_bounds = loc_bounds
    else:
        # NOTE(review): this branch is a no-op; filter_bounds is left
        # unchanged (possibly None, meaning no bounds filtering).
        filter_bounds = filter_bounds
    tiger = TigerImporter(*args,
                          verbose=options.verbose,
                          filter_city=options.city,
                          filter_bounds=filter_bounds,
                          encoding=options.encoding,
                          fix_cities=options.fix_cities)
    # NOTE(review): the DEBUG level is set only here, after the importer
    # has been constructed, so debug output from its construction is
    # suppressed — presumably unintended; confirm before relying on it.
    if options.verbose:
        import logging
        logger.setLevel(logging.DEBUG)
    num_created = tiger.save()
    logger.info("Created %d new blocks" % num_created)
    # Diagnostics: summarize feature names that produced no blocks.
    logger.debug("... from %d feature names" % len(tiger.featnames_db))
    logger.debug("feature tlids with blocks: %d"
                 % len(tiger.tlids_with_blocks))
    import pprint
    tlids_wo_blocks = set(tiger.featnames_db.keys()).difference(
        tiger.tlids_with_blocks)
    logger.debug("feature tlids WITHOUT blocks: %d" % len(tlids_wo_blocks))
    all_rows = []
    for t in tlids_wo_blocks:
        all_rows.extend(tiger.featnames_db[t])
    logger.debug("Rows: %d" % len(all_rows))
    names = [(r['FULLNAME'], r['TLID']) for r in all_rows]
    names.sort()
    logger.debug("=================")
    for n, t in names:
        logger.debug("%s %s" % (n, t))
    for tlid in sorted(tlids_wo_blocks)[:10]:
        feat = tiger.featnames_db[tlid]
        logger.debug(pprint.pformat(feat))
def gen_blocks(self, feature):
    """Yield a dict of block fields for each name attached to this edge.

    A single TIGER edge (identified by TLID) can carry several feature
    names; one block dict is yielded per name, all sharing the same
    address-range / zip / city / state fields.  TLIDs that produce at
    least one block are added to ``self.tlids_with_blocks``.
    """
    block_fields = {}
    tlid = feature.get('TLID')
    # Copy the from/to address ranges for both sides of the street.
    for field_key, feature_key in (('right_from_num', 'RFROMADD'),
                                   ('left_from_num', 'LFROMADD'),
                                   ('right_to_num', 'RTOADD'),
                                   ('left_to_num', 'LTOADD')):
        block_fields[field_key] = feature.get(feature_key)
    block_fields['right_zip'] = feature.get('ZIPR')
    block_fields['left_zip'] = feature.get('ZIPL')
    # City and state may differ on each side of the edge.
    for side in ('right', 'left'):
        block_fields[side + '_city'] = self._get_city(
            feature, side[0].upper()).upper()
        block_fields[side + '_state'] = self._get_state(
            feature, side[0].upper()).upper()
    if tlid in self.featnames_db:
        suffix_standardizer = geocoder_parsing.STANDARDIZERS['suffix']
        suffix_matcher = geocoder_parsing.TOKEN_REGEXES['suffix']
        for featname in self.featnames_db[tlid]:
            # Prefix eg. 'STATE HWY'.
            block_fields['prefix'] = featname.get('PRETYPABRV',
                                                  '').upper().strip()
            # Main part of the name, eg. 'MAIN'
            block_fields['street'] = featname['NAME'].upper().strip()
            # Prefix direction eg. 'N'.
            block_fields['predir'] = featname['PREDIRABRV'].upper().strip()
            # Suffix direction eg. 'SW'.
            block_fields['postdir'] = featname['SUFDIRABRV'].upper().strip()
            # Road type, eg. 'ST', 'AVE', 'PKWY'.
            block_fields['suffix'] = featname['SUFTYPABRV'].upper().strip()
            if not block_fields['suffix']:
                # Bug in the data:
                # Many streets named eg. 'Wilson Park' put the whole thing in the
                # name and nothing in the suffix.
                # This breaks our geocoder, because it parses 'Park' as the suffix
                # and expects to find it in that field.
                # So, check if the street name ends with a recognized suffix.
                if block_fields['street'].count(' '):
                    name_parts = block_fields['street'].split()
                    raw_suffix = name_parts.pop()
                    street = ' '.join(name_parts)
                    if suffix_matcher.match(raw_suffix):
                        block_fields['suffix'] = suffix_standardizer(
                            raw_suffix)
                        block_fields['street'] = street
            # More bugs in data: some auxiliary roads have the prefix as
            # part of the name in nonstandard format.
            if not block_fields['prefix']:
                prefix, street = None, None
                if block_fields['street'].startswith('INTERSTATE '):
                    prefix, street = block_fields['street'].split(' ', 1)
                elif block_fields['street'].startswith('I-'):
                    prefix, street = block_fields['street'].split('-', 1)
                if prefix and street:
                    logger.debug("Splitting prefix %r out of street %r"
                                 % (prefix, street))
                    block_fields['street'] = street.strip()
                    block_fields['prefix'] = prefix.strip()
            # Yield a copy so later iterations don't mutate earlier results.
            yield block_fields.copy()
            self.tlids_with_blocks.add(tlid)