def save(self, old_record, list_record, detail_record):
    """
    Create a NewsItem for a newly seen foreclosure filing.

    Does nothing when ``old_record`` is not None (the record was already
    saved). Otherwise it resolves the lookup values, builds the attribute
    dictionary from ``list_record`` and hands everything to
    ``self.create_newsitem`` with ``convert_to_block=True``.
    ``detail_record`` is not used.
    """
    if old_record is not None:
        return

    # Lookups are keyed and named by the same raw value from the record.
    type_value = list_record['property_type']
    property_type = self.get_or_create_lookup(
        'property_type', type_value, list_record['property_type'])
    year_value = list_record['year_of_mortgage']
    year_of_mortgage = self.get_or_create_lookup(
        'year_of_mortgage', year_value, list_record['year_of_mortgage'])

    block_label = address_to_block(strip_unit(list_record['address']))
    newsitem_title = 'Foreclosure filed for property in the %s' % block_label

    attributes = {
        'original_principal': list_record['original_i'],
        'property_type': property_type.id,
        'year_of_mortgage': year_of_mortgage.id,
    }
    # These fields are copied straight across under the same key.
    for field in ('pin_number', 'filing_date', 'case_number',
                  'document_number'):
        attributes[field] = list_record[field]
    attributes['raw_address'] = list_record['address']

    newsitem_kwargs = {
        'convert_to_block': True,
        'title': newsitem_title,
        'item_date': list_record['filing_date'],
        'location_name': strip_unit(list_record['address']),
    }
    self.create_newsitem(attributes, **newsitem_kwargs)
def save(self, old_record, list_record, detail_record):
    """
    Create a NewsItem for a property sold at a foreclosure auction.

    Does nothing when ``old_record`` is not None (the record was already
    saved). Otherwise it resolves the lookup values (property type, year
    of mortgage, purchaser), builds the attribute dictionary from
    ``list_record`` and hands everything to ``self.create_newsitem`` with
    ``convert_to_block=True``. The item is dated by the sale date.
    ``detail_record`` is not used.
    """
    if old_record is not None:
        return

    # Lookups are keyed and named by the same raw value from the record.
    type_value = list_record['property_type']
    property_type = self.get_or_create_lookup(
        'property_type', type_value, list_record['property_type'])
    year_value = list_record['year_of_mortgage']
    year_of_mortgage = self.get_or_create_lookup(
        'year_of_mortgage', year_value, list_record['year_of_mortgage'])
    buyer_value = list_record['purchaser']
    purchaser = self.get_or_create_lookup(
        'purchaser', buyer_value, list_record['purchaser'])

    block_label = address_to_block(strip_unit(list_record['address']))
    newsitem_title = (
        'Property on the %s sold at foreclosure auction' % block_label)

    attributes = {
        'original_principal': list_record['original_i'],
        'property_type': property_type.id,
        'year_of_mortgage': year_of_mortgage.id,
    }
    # These fields are copied straight across under the same key.
    for field in ('pin_number', 'case_number', 'document_number',
                  'filing_date'):
        attributes[field] = list_record[field]
    attributes['raw_address'] = list_record['address']
    attributes['auction_price'] = list_record['auction_price']
    attributes['purchaser'] = purchaser.id

    newsitem_kwargs = {
        'convert_to_block': True,
        'title': newsitem_title,
        'item_date': list_record['sale_date'],
        'location_name': strip_unit(list_record['address']),
    }
    self.create_newsitem(attributes, **newsitem_kwargs)
def create_newsitem(self, attributes, **kwargs):
    """
    Create, save and return a NewsItem built from ``kwargs``.

    Required kwargs: ``title``, ``item_date``, ``location_name``.

    Optional kwargs: ``schema`` (falls back to ``self.schema``),
    ``description``, ``url``, ``pub_date`` (falls back to
    ``self.start_time``), ``location``, ``location_object``, ``block``.

    ``convert_to_block`` (optional, default False): when true,
    ``location_name`` is rewritten with ``address_to_block``. The original
    address is geocoded first; the block-level name is geocoded only if
    that produced no location.

    ``attributes`` is assigned to the new item's ``attributes`` unless it
    is None.
    """
    block = None
    location = None

    # Only geocode when the caller did not supply a location.
    if 'location' not in kwargs:
        location = self.geocode(kwargs['location_name'])
        if location:
            block, location = location['block'], location['point']

    if kwargs.pop('convert_to_block', False):
        kwargs['location_name'] = address_to_block(kwargs['location_name'])
        # If the exact address couldn't be geocoded, try using the
        # normalized location name.
        if location is None:
            location = self.geocode(kwargs['location_name'])
            if location:
                block, location = location['block'], location['point']

    # "kwargs.get('schema', self.schema)" would evaluate self.schema even
    # when the key is present, which raises an error when using multiple
    # schemas; the "or" keeps that evaluation lazy.
    schema = kwargs.get('schema', None) or self.schema

    fields = dict(
        schema=schema,
        title=kwargs['title'],
        description=kwargs.get('description', ''),
        url=kwargs.get('url', ''),
        pub_date=kwargs.get('pub_date', self.start_time),
        item_date=kwargs['item_date'],
        location=kwargs.get('location', location),
        location_name=kwargs['location_name'],
        location_object=kwargs.get('location_object', None),
        block=kwargs.get('block', block),
    )
    ni = NewsItem.objects.create(**fields)
    if attributes is not None:
        ni.attributes = attributes

    self.num_added += 1
    self.logger.info(
        u'Created NewsItem %s (total created in this scrape: %s)',
        ni.id, self.num_added)
    return ni
def create_newsitem(self, attributes, **kwargs):
    """
    Creates and saves a NewsItem with the given kwargs. Returns the new
    NewsItem.

    kwargs MUST have the following keys:
        title
        item_date
        location_name

    For any other kwargs whose values aren't provided (schema,
    description, url, pub_date, location, location_object, block), this
    will use defaults.

    kwargs may optionally contain a 'convert_to_block' boolean. If True,
    this will convert the given kwargs['location_name'] to a block level
    but will first try the real (non-block-level) address for geocoding
    and Block association, falling back to the block-level name.

    attributes is a dictionary to use to populate this NewsItem's
    Attribute object. It may be None, in which case no attributes are
    assigned.
    """
    block = location = None
    # Only geocode when the caller did not supply a location.
    if 'location' not in kwargs:
        location = self.geocode(kwargs['location_name'])
        if location:
            block = location['block']
            location = location['point']
    if kwargs.pop('convert_to_block', False):
        kwargs['location_name'] = address_to_block(kwargs['location_name'])
        # If the exact address couldn't be geocoded, try using the
        # normalized location name.
        if location is None:
            location = self.geocode(kwargs['location_name'])
            if location:
                block = location['block']
                location = location['point']

    # Normally we'd just use "schema = kwargs.get('schema', self.schema)",
    # but self.schema will be evaluated even if the key is found in
    # kwargs, which raises an error when using multiple schemas.
    schema = kwargs.get('schema', None)
    schema = schema or self.schema

    ni = NewsItem.objects.create(
        schema=schema,
        title=kwargs['title'],
        description=kwargs.get('description', ''),
        url=kwargs.get('url', ''),
        pub_date=kwargs.get('pub_date', self.start_time),
        item_date=kwargs['item_date'],
        location=kwargs.get('location', location),
        location_name=kwargs['location_name'],
        location_object=kwargs.get('location_object', None),
        block=kwargs.get('block', block),
    )
    # Skip the assignment when no attributes were supplied, so a None
    # value is never pushed into the attributes setter.
    if attributes is not None:
        ni.attributes = attributes
    self.num_added += 1
    self.logger.info(
        u'Created NewsItem %s (total created in this scrape: %s)',
        ni.id, self.num_added)
    return ni
def normalize_address(self, record):
    """
    Build a block-level address string from a collapsed street record.

    The street arrives with no spaces, so known suffixes are tried
    (longest group first) and the remaining stem is looked up in
    ``self.streets``. A stem starting with 'ST' is also tried with
    'SAINT' substituted for that prefix. When nothing matches, the raw
    street value is used with an empty suffix.

    Raises SkipRecord if ``record['offensestreet']`` is None.
    Increments ``self.records_seen`` for every processed record and
    ``self.matches_found`` when a street was matched.
    Returns the result of ``address_to_block`` on the assembled address.
    """
    collapsed = record['offensestreet']
    if collapsed is None:
        raise SkipRecord('Skipping record with no street')

    suffix_groups = (
        ('EXPWY', 'PKWY', 'FRWY', 'BLVD'),
        ('HWY', 'FRW', 'AVE', 'CIR', 'EXT', 'BLV', 'PKW', 'ROW', 'WAY',
         'EXP'),
        ('DR', 'ST', 'RD', 'LN', 'BL', 'TR', 'WY', 'CT', 'PL', 'AV', 'CI'),
        ('P', 'R', 'F', 'D', 'S', 'L'),
    )

    street = collapsed
    matching_suffix = ''
    match_found = False
    # Walk every suffix in group order; the first successful lookup wins.
    for suffix in (s for group in suffix_groups for s in group):
        if not collapsed.endswith(suffix):
            continue
        stem = collapsed[:-len(suffix)]
        # The streets mapping is keyed by collapsed street names.
        candidates = [stem]
        if stem.startswith('ST'):
            # SAINT is encoded as ST in the data, but Saint in the
            # streets table, so try that spelling as well.
            candidates.append('SAINT%s' % stem[2:])
        for candidate in candidates:
            try:
                street = self.streets[candidate]
            except KeyError:
                continue
            matching_suffix = suffix
            match_found = True
            break
        if match_found:
            break

    if match_found:
        self.matches_found += 1
    self.records_seen += 1

    normalized_block = record['offenseblock'].lstrip('0')
    if normalized_block[-2:] == 'xx':
        normalized_block = normalized_block.replace('xx', '00')

    parts = (normalized_block, record['offensedirection'] or '', street,
             matching_suffix)
    address = re.sub(r'\s+', ' ', '%s %s %s %s' % parts)
    return address_to_block(address)
def normalize_address(self, record):
    """
    Build a block-level address string from a collapsed street record.

    The street arrives with no spaces, so known suffixes are tried
    (longest group first) and the remaining stem is looked up in
    ``self.streets``. A stem starting with 'ST' is also tried with
    'SAINT' substituted for that prefix. When nothing matches, the raw
    street value is used with an empty suffix.

    Raises SkipRecord if ``record['offensestreet']`` is None.
    Increments ``self.records_seen`` for every processed record and
    ``self.matches_found`` when a street was matched.
    Returns the result of ``address_to_block`` on the assembled address.
    """
    raw_street = record['offensestreet']
    if raw_street is None:
        raise SkipRecord('Skipping record with no street')

    suffix_groups = [
        ('EXPWY', 'PKWY', 'FRWY', 'BLVD'),
        ('HWY', 'FRW', 'AVE', 'CIR', 'EXT', 'BLV', 'PKW', 'ROW', 'WAY',
         'EXP'),
        ('DR', 'ST', 'RD', 'LN', 'BL', 'TR', 'WY', 'CT', 'PL', 'AV', 'CI'),
        ('P', 'R', 'F', 'D', 'S', 'L'),
    ]
    ordered_suffixes = [s for group in suffix_groups for s in group]

    street = raw_street
    matching_suffix = ''
    match_found = False
    # The first suffix whose stem resolves in the streets mapping wins.
    for suffix in ordered_suffixes:
        if not raw_street.endswith(suffix):
            continue
        base = raw_street[:-len(suffix)]
        spellings = [base]
        if base.startswith('ST'):
            # SAINT is encoded as ST in the data, but Saint in the
            # streets table, so try that spelling as well.
            spellings.append('SAINT%s' % base[2:])
        for spelling in spellings:
            try:
                street = self.streets[spelling]
            except KeyError:
                continue
            matching_suffix = suffix
            match_found = True
            break
        if match_found:
            break

    if match_found:
        self.matches_found += 1
    self.records_seen += 1

    normalized_block = record['offenseblock'].lstrip('0')
    if normalized_block[-2:] == 'xx':
        normalized_block = normalized_block.replace('xx', '00')

    direction = record['offensedirection'] or ''
    address = '%s %s %s %s' % (
        normalized_block, direction, street, matching_suffix)
    address = re.sub(r'\s+', ' ', address)
    return address_to_block(address)
def normalize_address(self, record):
    """
    Build a block-level address string from a collapsed street record.

    The street arrives with no spaces, so known suffixes are tried
    (longest group first) and the remaining stem is looked up in
    ``self.streets``. A stem starting with "ST" is also tried with
    "SAINT" substituted for that prefix. When nothing matches, the raw
    street value is used with an empty suffix.

    Raises SkipRecord if ``record["offensestreet"]`` is None.
    Increments ``self.records_seen`` for every processed record and
    ``self.matches_found`` when a street was matched.
    Returns the result of ``address_to_block`` on the assembled address.
    """
    collapsed = record["offensestreet"]
    if collapsed is None:
        raise SkipRecord("Skipping record with no street")

    suffix_groups = (
        ("EXPWY", "PKWY", "FRWY", "BLVD"),
        ("HWY", "FRW", "AVE", "CIR", "EXT", "BLV", "PKW", "ROW", "WAY", "EXP"),
        ("DR", "ST", "RD", "LN", "BL", "TR", "WY", "CT", "PL", "AV", "CI"),
        ("P", "R", "F", "D", "S", "L"),
    )

    street = collapsed
    matching_suffix = ""
    match_found = False
    # Walk every suffix in group order; the first successful lookup wins.
    for suffix in (s for group in suffix_groups for s in group):
        if not collapsed.endswith(suffix):
            continue
        stem = collapsed[: -len(suffix)]
        # The streets mapping is keyed by collapsed street names.
        candidates = [stem]
        if stem.startswith("ST"):
            # SAINT is encoded as ST in the data, but Saint in the
            # streets table, so try that spelling as well.
            candidates.append("SAINT%s" % stem[2:])
        for candidate in candidates:
            try:
                street = self.streets[candidate]
            except KeyError:
                continue
            matching_suffix = suffix
            match_found = True
            break
        if match_found:
            break

    if match_found:
        self.matches_found += 1
    self.records_seen += 1

    normalized_block = record["offenseblock"].lstrip("0")
    if normalized_block[-2:] == "xx":
        normalized_block = normalized_block.replace("xx", "00")

    parts = (normalized_block, record["offensedirection"] or "", street, matching_suffix)
    address = re.sub(r"\s+", " ", "%s %s %s %s" % parts)
    return address_to_block(address)
def clean_list_record(self, record):
    """
    Normalize a scraped list record in place and return it.

    Sets or rewrites these keys on ``record``:
      * ``raw_address``: the title-cased, entity-decoded address.
      * ``address``: ``raw_address`` passed through ``clean_address`` and
        ``address_to_block``.
      * ``disposition``: entity-decoded and stripped; 'Not available' when
        empty; 'CANCCOMM' is expanded to 'CANCELLED BY COMMUNICATIONS'.
      * ``event``: entity-decoded and stripped.
      * ``item_date`` / ``item_time``: date and time parsed from
        ``record['datetime']`` (format '%m/%d/%Y %I:%M:%S %p').
    """
    def decode_entities(text):
        # NOTE(review): the replacements previously mapped '&' to '&' and
        # ' ' to ' ', which changed nothing; the entity names appear to
        # have been lost. Restored to decode the HTML entities -- confirm
        # against the scraped markup.
        return text.replace('&amp;', '&').replace('&nbsp;', ' ')

    # Save the raw address so we can use it to find duplicate records in
    # the future.
    address = smart_title(
        decode_entities(record['address'].strip())).strip()
    record['raw_address'] = address
    record['address'] = address_to_block(clean_address(address))

    record['disposition'] = (
        decode_entities(record['disposition']).strip() or 'Not available')
    record['event'] = decode_entities(record['event']).strip()

    item_date = parse_date(
        record['datetime'], '%m/%d/%Y %I:%M:%S %p', return_datetime=True)
    record['item_date'] = item_date.date()
    record['item_time'] = item_date.time()

    # Normalize this value.
    if record['disposition'] == 'CANCCOMM':
        record['disposition'] = 'CANCELLED BY COMMUNICATIONS'
    return record
return {'type': 'address', 'result': results, 'ambiguous': True} else: return {'type': 'address', 'result': results[0], 'ambiguous': False} except InvalidBlockButValidStreet, e: result = { 'type': 'block', 'ambiguous': True, 'result': e.block_list, 'street_name': e.street_name, 'block_number': e.block_number, } if convert_to_block: # If the exact address couldn't be geocoded, try using the # normalized block name. block_name = address_to_block(query) if block_name != query: try: result['result'] = BlockGeocoder()._do_geocode(block_name) result['result']['address'] = block_name result['ambiguous'] = False logger.debug('Resolved %r to block %r' % (query, block_name)) except (InvalidBlockButValidStreet, AmbiguousResult): pass if result['ambiguous']: logger.debug('Invalid block for %r, returning all possible blocks' % query) return result except: raise
def create_newsitem(self, attributes, **kwargs):
    """
    Create, save and return a NewsItem built from ``kwargs``.

    Required kwargs:

    * title
    * item_date
    * location_name AND/OR location

    Optional kwargs:

    zipcode, city, state
        passed to ``self.geocode_if_needed`` along with the location
        values.

    convert_to_block
        when true, the resulting location_name is rewritten with
        ``address_to_block`` after geocoding. Default False.

    schema, description, url, pub_date, location_object
        used as given; ``schema`` falls back to ``self.schema`` and
        ``pub_date`` to ``self.start_time``.

    ``attributes`` is assigned to the new item's ``attributes`` unless it
    is None.
    """
    convert_to_block = kwargs.pop('convert_to_block', False)
    geocode_hints = dict(
        zipcode=kwargs.pop('zipcode', None),
        city=kwargs.pop('city', None),
        state=kwargs.pop('state', None),
    )
    location, location_name = self.geocode_if_needed(
        kwargs.get('location'),
        kwargs.get('location_name'),
        convert_to_block=convert_to_block,
        **geocode_hints)

    assert location or location_name, "At least one of location or location_name must be provided"

    if convert_to_block:
        location_name = address_to_block(location_name)

    kwargs.update(location_name=location_name, location=location)

    # "kwargs.get('schema', self.schema)" would evaluate self.schema even
    # when the key is present, which raises an error when using multiple
    # schemas; the "or" keeps that evaluation lazy.
    schema = kwargs.get('schema', None) or self.schema

    fields = dict(
        schema=schema,
        title=kwargs['title'],
        description=kwargs.get('description', ''),
        url=kwargs.get('url', ''),
        pub_date=kwargs.get('pub_date', self.start_time),
        item_date=kwargs['item_date'],
        location=location,
        location_name=location_name,
        location_object=kwargs.get('location_object', None),
    )
    ni = NewsItem.objects.create(**fields)
    if attributes is not None:
        ni.attributes = attributes

    self.num_added += 1
    self.logger.info(
        u'Created NewsItem %s: %s (total created in this scrape: %s)',
        schema.slug, ni.id, self.num_added)
    return ni
'result': results[0], 'ambiguous': False } except InvalidBlockButValidStreet, e: result = { 'type': 'block', 'ambiguous': True, 'result': e.block_list, 'street_name': e.street_name, 'block_number': e.block_number, } if convert_to_block: # If the exact address couldn't be geocoded, try using the # normalized block name. block_name = address_to_block(query) if block_name != query: try: result['result'] = BlockGeocoder()._do_geocode(block_name) result['result']['address'] = block_name result['ambiguous'] = False logger.debug('Resolved %r to block %r' % (query, block_name)) except (InvalidBlockButValidStreet, AmbiguousResult): pass if result['ambiguous']: logger.debug( 'Invalid block for %r, returning all possible blocks' % query) return result except:
def create_newsitem(self, attributes, **kwargs):
    """
    Create, save and return a NewsItem built from ``kwargs``.

    Required kwargs:

    * title
    * item_date
    * location_name AND/OR location

    Optional kwargs:

    zipcode, city, state
        passed to ``self.geocode_if_needed`` along with the location
        values.

    convert_to_block
        when true, the resulting location_name is rewritten with
        ``address_to_block`` after geocoding. Default False.

    schema, description, url, pub_date, location_object
        used as given; ``schema`` falls back to ``self.schema`` and
        ``pub_date`` to ``self.start_time``.

    ``attributes`` is assigned to the new item's ``attributes`` unless it
    is None.
    """
    to_block = kwargs.pop('convert_to_block', False)
    zipcode = kwargs.pop('zipcode', None)
    city = kwargs.pop('city', None)
    state = kwargs.pop('state', None)

    resolved = self.geocode_if_needed(
        kwargs.get('location', None),
        kwargs.get('location_name', None),
        zipcode=zipcode,
        city=city,
        state=state,
        convert_to_block=to_block,
    )
    location, location_name = resolved
    assert location or location_name, "At least one of location or location_name must be provided"

    if to_block:
        location_name = address_to_block(location_name)

    kwargs['location_name'] = location_name
    kwargs['location'] = location

    # "kwargs.get('schema', self.schema)" would evaluate self.schema even
    # when the key is present, which raises an error when using multiple
    # schemas; the "or" keeps that evaluation lazy.
    schema = kwargs.get('schema', None) or self.schema

    ni = NewsItem.objects.create(
        schema=schema,
        title=kwargs['title'],
        description=kwargs.get('description', ''),
        url=kwargs.get('url', ''),
        pub_date=kwargs.get('pub_date', self.start_time),
        item_date=kwargs['item_date'],
        location=location,
        location_name=location_name,
        location_object=kwargs.get('location_object', None),
    )
    if attributes is not None:
        ni.attributes = attributes

    self.num_added += 1
    self.logger.info(
        u'Created NewsItem %s: %s (total created in this scrape: %s)',
        schema.slug, ni.id, self.num_added)
    return ni