Example #1
0
    def x_test_errors(self):
        """Parse every address in the businesses 'errors' partition and dump the results to CSV.

        Loads the sandiego.gov businesses bundle from a hard-coded local path,
        runs each row's ``address`` through the address parser, and writes one
        CSV row per address (columns given by ``self.header``) to
        ``support/business_addresses.out.csv`` next to this file.

        NOTE(review): the ``x_`` prefix presumably keeps the test runner from
        collecting this method -- confirm before relying on it.
        """
        from databundles.geo.address import Parser
        import csv
        import imp
        import os

        address_parser = Parser()

        # The bundle module is loaded straight from a developer checkout.
        bundle_module = imp.load_source('bundle', 
            '/Users/eric/proj/Bundles/src/civicdata/sandiego.gov/sandiego.gov-businesses-orig/bundle.py')
        loaded_bundle = bundle_module.Bundle()

        errors_partition = loaded_bundle.partitions.find(table='businesses', grain='errors')

        support_dir = os.path.join(os.path.dirname(__file__), 'support')
        out_path = os.path.join(support_dir, 'business_addresses.out.csv')

        with open(out_path, 'w') as out_file:
            csv_writer = csv.DictWriter(out_file, self.header)
            csv_writer.writeheader()

            for record in errors_partition.query("SELECT * FROM businesses"):
                parsed = address_parser.parse(record['address'])

                # One output row: the parsed fields plus the raw input and the
                # parser's string rendering of the result.
                out_row = parsed.as_dict()
                out_row['input'] = record['address'].strip()
                out_row['output'] = str(parsed)
                csv_writer.writerow(out_row)
Example #2
0
    def test_address_files(self):
        """Parse every line of ``support/crime_addresses.txt`` and report the failure rate.

        Each successfully parsed line is written as a CSV row (columns given by
        ``self.header``) to ``support/crime_addresses.out.csv``. A line counts
        as a failure when the parser raises, returns a falsy result, or yields
        a result without a city. A summary line with the totals and the
        failure percentage is printed at the end.
        """
        import os
        from databundles.geo.address import Parser
        import csv

        parser = Parser()

        success = 0
        failure = 0
        total = 0
        filename = "crime_addresses"
        support_dir = os.path.join(os.path.dirname(__file__), 'support')
        f_input = os.path.join(support_dir, filename + '.txt')
        f_output = os.path.join(support_dir, filename + '.out.csv')

        with open(f_output, 'w') as out:
            writer = csv.DictWriter(out, self.header)
            writer.writeheader()
            with open(f_input) as f:
                for line in f:

                    total += 1

                    # Single-argument print(...) produces the same output
                    # whether print is a statement or a function.
                    print('----')
                    print(line.strip())

                    # Only the parser call can raise; keep the try body minimal.
                    try:
                        ps = parser.parse(line)
                    except Exception as e:
                        print("ERROR {}".format(e))
                        failure += 1
                        continue

                    if not ps:
                        failure += 1
                        continue

                    d = ps.as_dict()
                    d['input'] = line.strip()
                    d['output'] = str(ps)
                    writer.writerow(d)

                    if not ps.city:
                        # Parsed, but without a city: written to the CSV yet
                        # still counted as a failure.
                        failure += 1
                        print(d)
                        print(ps)
                        print('')
                    else:
                        success += 1

        # An empty input file would otherwise divide by zero.
        if total:
            rate = round((float(failure) / float(total) * 100), 3)
        else:
            rate = 0.0

        print('')
        print("total={} success={} failure={} rate={}".format(total, success, failure, rate))
Example #3
0
    def __init__(self, library, **kwargs):
        """Bind the geocoder to the addresses partition of a library dependency.

        Args:
            library: object whose ``dep(name).partition`` provides the
                partition that is stored on ``self.addresses``.
            geocoder_ds: addresses dataset dependency name. Defaults to 'geocoder'.

        Raises:
            ConfigurationError: if looking up the dependency raises
                ConfigurationError; re-raised with a message naming the
                missing ``build.dependencies`` entry.
        """

        from databundles.geo.address import Parser
        from databundles.dbexceptions import ConfigurationError

        self.parser = Parser()

        addressesds = kwargs.get('geocoder_ds', 'geocoder')

        try:
            self.addresses = library.dep(addressesds).partition

        except ConfigurationError:
            # The ". " before "See" keeps the URL from being glued onto the
            # dependency name in the rendered message.
            raise ConfigurationError(("MISSING DEPENDENCY: To get addresses or codes, the configuration  "+
                "must specify a dependency with a set named '{0}', in build.dependencies.{0}. "+
                "See https://github.com/clarinova/databundles/wiki/Error-Messages#geogeocodergeocoder__init__")
                .format(addressesds))

        # Jurisdiction lookup tables, loaded once from the partition.
        self.by_scode, self.by_name = self.jur_codes()
Example #4
0
class Geocoder(object):
    """Geocode street addresses and intersections against an addresses partition.

    The partition (``self.addresses``) is queried for its ``segments``,
    ``addresses``, ``nodes`` and ``places`` tables. Input strings are parsed
    with ``databundles.geo.address.Parser``.
    """

    def __init__(self, library, **kwargs):
        """Bind the geocoder to the addresses partition of a library dependency.

        Args:
            library: object whose ``dep(name).partition`` provides the
                partition that is stored on ``self.addresses``.
            geocoder_ds: addresses dataset dependency name. Defaults to 'geocoder'.

        Raises:
            ConfigurationError: if looking up the dependency raises
                ConfigurationError; re-raised with a message naming the
                missing ``build.dependencies`` entry.
        """

        from databundles.geo.address import Parser
        from databundles.dbexceptions import ConfigurationError

        self.parser = Parser()

        addressesds = kwargs.get('geocoder_ds', 'geocoder')

        try:
            self.addresses = library.dep(addressesds).partition

        except ConfigurationError:
            # The ". " before "See" keeps the URL from being glued onto the
            # dependency name in the rendered message.
            raise ConfigurationError(("MISSING DEPENDENCY: To get addresses or codes, the configuration  "+
                "must specify a dependency with a set named '{0}', in build.dependencies.{0}. "+
                "See https://github.com/clarinova/databundles/wiki/Error-Messages#geogeocodergeocoder__init__")
                .format(addressesds))

        # Jurisdiction lookup tables, loaded once from the partition.
        self.by_scode, self.by_name = self.jur_codes()

    def get_srs(self):
        """Return the spatial reference system reported by the addresses partition."""
        return self.addresses.get_srs()

    def _parse(self, text):
        """Parse *text* with the address parser.

        Returns:
            The parse result, or None when the parser raises or returns a
            falsy value.
        """
        try:
            ps = self.parser.parse(text)
        except Exception:
            # Best effort: an unparseable string is treated as "no result".
            return None

        return ps if ps else None

    def geocode(self, street):
        """Geocode *street* as an intersection or as a street segment.

        A string containing ' / ' is split on the first '/' and passed to
        geocode_intersection(); anything else goes to geocode_street().

        Returns:
            The intersection record, or the street segment when its score is
            above 20; otherwise None. Exceptions from geocode_street()
            propagate to the caller.
        """

        if ' / ' in street:
            s1, s2 = street.split('/', 1)
            return self.geocode_intersection(s1, s2)

        address = self.geocode_street(street)

        if address and address['score'] > 20:
            return address

        return None

    def geocode_address(self, address):
        """Geocode an address to a point.

        The best matching segment is located first. The point is then taken,
        in order of preference, from:

        * the nearest known address on the segment ('cns/address'),
        * linear interpolation between the segment ends ('cns/seginterp'),
        * the segment midpoint ('cns/segmid').

        Returns:
            None when the address cannot be parsed or no segment matches.
            Otherwise a dict with keys 'address', 'segment', 'gctype',
            'gcquality', 'x', 'y' and 'codedaddress'.
        """

        ps = self._parse(address)
        if ps is None:
            return None

        segment = self._geocode_segment(ps)
        if not segment:
            return None

        r = {'address': None, 'segment': segment}

        number_low = segment['lnumber']
        number_high = segment['hnumber']

        # Try to get a specific address within the segment.
        if ps.number is not None and number_low <= ps.number <= number_high:

            nearest = self.addresses.query("""
                SELECT * FROM addresses WHERE segment_source_id = ? 
                ORDER BY ABS(number - ?) ASC LIMIT 1""", segment['segment_source_id'], ps.number).first()

            if nearest:
                offset = abs(int(nearest['number']) - ps.number)

                # Only accept the known address when it is closer than the
                # segment's own house-number span.
                if offset < (number_high - number_low):
                    r['address'] = nearest
                    r['gctype'] = 'cns/address'
                    r['gcquality'] = offset
                    x = nearest['x']
                    y = nearest['y']
                    ps.number = nearest['number']  # For re-coding the address later.

        if not r['address']:
            interpolated = self._interpolate_on_segment(segment, ps.number)

            if interpolated is not None:
                x, y, r['gcquality'] = interpolated
                r['gctype'] = 'cns/seginterp'
            else:
                # Punt and use the midpoint.
                x = segment['xm']
                y = segment['ym']
                r['gctype'] = 'cns/segmid'
                r['gcquality'] = 0

        # Re-code the parsed address with the segment's canonical values.
        ps.city = segment['city']
        ps.street_name = segment['street']
        ps.street_type = segment['street_type']
        ps.street_direction = segment['street_dir']

        r['x'] = x
        r['y'] = y
        r['codedaddress'] = str(ps) or None

        return r

    def _interpolate_on_segment(self, segment, number):
        """Interpolate the position of house *number* along *segment*.

        The position is computed parametrically from the segment end points,
        so vertical segments (x1 == x2) work like any other.

        Returns:
            A tuple (x, y, quality), where quality is the segment length minus
            the distance from the point to the segment midpoint, truncated to
            an int. Returns None when the inputs cannot be interpolated
            (missing or non-numeric values, zero house-number span) or when
            the point lies farther from the midpoint than the segment is long.
        """
        import math

        try:
            number_low = segment['lnumber']
            number_high = segment['hnumber']
            x1, y1, x2, y2 = segment['x1'], segment['y1'], segment['x2'], segment['y2']

            # Fraction of the way from the low end to the high end.
            t = float(number - number_low) / float(number_high - number_low)

            x = x1 + t * (x2 - x1)
            y = y1 + t * (y2 - y1)

            # Distance from the address to the midpoint.
            d1 = math.hypot(segment['xm'] - x, segment['ym'] - y)

            # Distance from end to end.
            d2 = math.hypot(x1 - x2, y1 - y2)

            quality = int(d2 - d1)

        except (ArithmeticError, KeyError, TypeError, ValueError):
            return None

        if d1 > d2:
            # The difference is too large to trust the interpolation.
            return None

        return x, y, quality

    def geocode_street(self, street):
        """Geocode an address to a street segment.

        Returns:
            The best matching segment dict, or None when *street* cannot be
            parsed or no segment matches.
        """
        ps = self._parse(street)
        if ps is None:
            return None

        return self._geocode_segment(ps)

    def _geocode_segment(self, ps):
        """Return the best scoring segment for the parsed address *ps*.

        Every segment whose street equals ``ps.street_name`` is scored with
        rank_street(). The first segment with the highest score wins and is
        returned as a dict with 'score', 'city', 'lat', 'lon', 'x', 'y',
        'gctype' and 'gcquality' added. Returns None when no segment matches.
        """

        direction = ps.street_direction
        street = ps.street_name
        street_type = ps.street_type
        number = ps.number

        q = """SELECT  * FROM segments WHERE street = ?"""

        # If this fails, the "city" is probably an unincorporated place, which is in the county.
        try:
            in_city = self.by_name[ps.city.title()]
        except (AttributeError, KeyError):
            in_city = self.by_name['NONE']

        max_score = 0
        winner = None

        for row in self.addresses.query(q, street):

            s = dict(row)

            s['score'] = score = self.rank_street(s, number, direction, street_type, in_city)

            if in_city == s['rcity']:
                s['city'] = s['rcity']
            elif in_city == s['lcity']:
                s['city'] = s['lcity']
            else:
                s['city'] = None

            # Strict '>' keeps the first of several equally scored segments.
            if winner is None or score > max_score:
                winner = s
                max_score = score

        if winner is not None:
            # Report the segment's centre point as its location.
            winner['lat'] = winner['latc']
            winner['lon'] = winner['lonc']
            winner['x'] = winner['xc']
            winner['y'] = winner['yc']
            winner['gctype'] = 'cns/segment'
            winner['gcquality'] = winner['score']

        return winner

    def geocode_intersection(self, street1, street2):
        """Geocode the intersection of two streets.

        Returns:
            A dict copy of the matching ``nodes`` row with 'gctype' set to
            'cns/intersection', or None when either street cannot be parsed
            or no node matches.
        """

        ps1 = self._parse(street1)
        if ps1 is None:
            return None

        ps2 = self._parse(street2)
        if ps2 is None:
            return None

        # The node may list the two streets in either order.
        q = """SELECT  * FROM nodes 
        WHERE street_1 = ? and street_2 = ?
        OR street_1 = ? and street_2 = ? LIMIT 1"""

        intr = self.addresses.query(q, ps1.street_name, ps2.street_name, ps2.street_name, ps1.street_name).first()

        if not intr:
            return None

        winner = dict(intr)
        winner['gctype'] = 'cns/intersection'
        return winner

    def jur_codes(self):
        """Load the jurisdiction lookup tables from the ``places`` table.

        Returns:
            A tuple (by_scode, by_name): by_scode maps scode to (code, name),
            by_name maps place name to code. by_name also maps
            'County Unincorporated', 'Unincorporated' and 'NONE' to 'SndSDO'.
        """

        by_scode = {}
        by_name = {}
        for place in self.addresses.query("SELECT code, scode, name FROM places WHERE type = 'city'"):
            by_scode[place['scode']] = (place['code'], place['name'])
            by_name[place['name']] = place['code']

        by_name['County Unincorporated'] = 'SndSDO'
        by_name['Unincorporated'] = 'SndSDO'
        by_name['NONE'] = 'SndSDO'

        return by_scode, by_name

    def rank_street(self, row, number, direction, street_type, city):
        """Create a score for a street segment based on how well it matches the input.

        Scoring: +10 matching direction, +10 matching street type, +20 when
        *city* equals the segment's left or right city, +25 when *number* is
        inside the segment's house-number range. A number outside the range
        earns up to 15 points, one per 100 numbers closer than 1500 to the
        nearer end of the range.
        """

        score = 0

        # The original guard around this comparison was always true, so the
        # score depends only on equality (None and '' do not match each other).
        if row['street_dir'] == direction:
            score += 10

        if row['street_type'] == street_type:
            score += 10

        if city == row['rcity'] or city == row['lcity']:
            score += 20

        # A missing number can neither be in range nor near it.
        if number is not None and row['lnumber'] <= number <= row['hnumber']:
            score += 25
        elif number:
            numdist = min(abs(number - row['lnumber']), abs(number - row['hnumber']))

            if numdist < 1500:
                score += int((1500 - numdist) / 100)  # max of 15 points

        return score

    def _do_search(self, queries, number, street, street_type, city, state):
        """Run *queries* in order and return the candidates of the first one that matches.

        Args:
            queries: sequence of (quality, sql, args) tuples.
            number: house number; a falsy value short-circuits the search.
            street, street_type, state: accepted for interface compatibility; unused.
            city: requested city. When falsy, each result row's own 'city'
                (or 'rcity') is used instead.

        Returns:
            A dict mapping (city, street, street_type) to a list of candidate
            dicts, or an empty list when *number* is falsy or nothing matches.
        """

        if not number:
            return []

        # Computed once, and never overwritten from a result row, so one row's
        # city cannot leak into the next row or the next query.
        requested_city = city.title() if city else None

        for quality, query, args in queries:

            candidates = {}

            for ar in self.addresses.query(query, *args):
                ar = dict(ar)

                if requested_city:
                    row_city = requested_city
                else:
                    row_city = ar.get('city', ar.get('rcity', None))

                r = {
                    'quality': quality,
                    'addresses_id': ar.get('addresses_id'),
                    'segment_source_id': ar.get('segment_source_id'),
                    'address_source_id': ar.get('addr_source_id'),
                    'zip': ar.get('zip'),
                    'street': ar['street'],
                    'street_dir': ar.get('street_dir', None),
                    'street_type': ar['street_type'],
                    'x': ar.get('x'),
                    'y': ar.get('y'),
                    'lat': ar.get('lat'),
                    'lon': ar.get('lon'),
                    'number': ar.get('number', ar.get('lnumber')),
                    'city': row_city
                }

                candidates.setdefault((row_city, ar['street'], ar['street_type']), []).append(r)

            if candidates:
                return candidates

        return []

    def get_street_addresses(self, segment_source_id):
        """Return the addresses on a segment as a dict keyed by house number."""

        addresses = {}

        for ar in self.addresses.query("SELECT * FROM addresses WHERE segment_source_id = ?", segment_source_id):
            addresses[ar['number']] = dict(ar)

        return addresses

    def geocode_semiblock(self, street, city, state):
        """Find the segments whose house-number range contains the parsed street number.

        Only *street* is parsed; the city is expected to be broken out
        already. *state* is accepted but unused. Progressively looser queries
        are tried, and the rows of the first one that matches are returned.

        Returns:
            A dict mapping segment_source_id to a list of segment dicts. An
            empty list when *street* cannot be parsed or has no number; an
            empty dict when no query matches.
        """

        ps = self._parse(street)
        if ps is None:
            return []

        number = ps.number
        street = ps.street_name
        street_type = ps.street_type

        if not number:
            return []

        # A missing city or street name is passed through unchanged rather
        # than crashing on .title().
        city = city.title() if city else city
        street = street.title() if street else street

        queries = [
            ("""SELECT 10 as gcquality, * FROM segments WHERE  (lcity = ?  or rcity = ? )
            AND street = ? AND street_type = ? AND ? BETWEEN lnumber AND hnumber
            AND has_addresses = 1
            ORDER BY hnumber ASC""",(city,  city, street, street_type, number)),

            ("""SELECT 9 as gcquality, * FROM segments WHERE (lcity = ?  or rcity = ? ) AND 
            street = ? AND ? BETWEEN lnumber AND hnumber
            AND has_addresses = 1
            ORDER BY hnumber ASC""",(city, city,  street, number)),

            ("""SELECT 8 as gcquality, * FROM segments WHERE (lcity = ?  or rcity = ? ) AND 
            street = ? AND ? BETWEEN lnumber AND hnumber
            ORDER BY hnumber ASC""",(city, city,  street, number)),

            ("""SELECT 7 as gcquality, * FROM segments WHERE street = ? AND ? BETWEEN lnumber AND hnumber
            AND has_addresses = 1
            ORDER BY hnumber ASC""",(street, number)),

        ]

        for query, args in queries:

            candidates = {}

            for ar in self.addresses.query(query, *args):
                ar = dict(ar)

                candidates.setdefault(ar['segment_source_id'], []).append(ar)

            if candidates:
                return candidates

        return {}

    def _address_geocode_parts(self, number, street, street_type, city, state):
        """Search the ``addresses`` table for an already-parsed address.

        Exact house-number matches are tried first (quality 20 to 18), then
        any address in the same block of 100 numbers (quality 17 to 15), each
        with progressively fewer constraints. The work is delegated to
        _do_search().

        Returns:
            Whatever _do_search() returns, or an empty list when *number* is
            falsy.
        """

        if not number:
            return []

        # _do_search() accepts an empty city, so do not crash on one here.
        city = city.title() if city else city
        street = street.title() if street else street

        block_number = int(float(number)/100.0)*100
        # Kept as an int so both BETWEEN bounds have the same type.
        block_end = block_number + 99

        queries = [
            (20, """SELECT * FROM addresses WHERE city = ? AND street = ? AND street_type = ? AND number = ?
            ORDER BY segment_source_id""",(city,  street, street_type, number )), 
            (19, """SELECT * FROM addresses WHERE city = ? AND street = ? AND number = ?
            ORDER BY segment_source_id""",(city,  street, number )), 
            (18, """SELECT * FROM addresses WHERE street = ? AND number = ?
            ORDER BY segment_source_id""",(street, number )),
            (17, """SELECT * FROM addresses WHERE city = ? AND street = ? AND street_type = ? AND number BETWEEN ? AND ?
            ORDER BY segment_source_id""",(city,  street, street_type, block_number, block_end) ), 
            (16, """SELECT * FROM addresses WHERE city = ? AND street = ? AND number BETWEEN ? AND ?
            ORDER BY segment_source_id""",(city,  street, block_number, block_end) ), 
            (15, """SELECT * FROM addresses WHERE street = ? AND number BETWEEN ? AND ?
            ORDER BY segment_source_id""",(street, block_number, block_end) )
        ]

        return self._do_search(queries, number, street, street_type, city, state)