Beispiel #1
0
    def build_addresses(self):
        """Geocode the facility addresses and build the address table.

        Reads every row of the ``facilities`` partition, formats an address
        string from its ``dba_address1``, ``dba_city`` and ``dba_zip_code``
        columns (the state is fixed to ``CA``), runs the addresses through
        the DSTK geocoder and inserts one row per geocoder result into a
        freshly cleaned ``facilities_addresses`` partition. Every inserted
        row carries ``facilities_id``, copied from the source row's ``id``.
        """

        from ambry.geo.geocoders import DstkGeocoder

        facilities = self.partitions.find(table='facilities')

        def address_gen():
            # Yield (address string, source row) pairs; the source row comes
            # back from the geocoder alongside the result for that address.
            for row in facilities.query("SELECT * FROM facilities"):
                address = "{}, {}, {} {}".format(row['dba_address1'],
                                                 row['dba_city'], 'CA',
                                                 row['dba_zip_code'])
                yield (address, row)

        dstk_service = self.config.service('dstk')

        dstk_gc = DstkGeocoder(dstk_service, address_gen())

        p = self.partitions.find_or_new(table='facilities_addresses')
        p.clean()

        lr = self.init_log_rate(500)

        with p.inserter() as ins:
            for i, (k, r, inp_row) in enumerate(dstk_gc.geocode()):
                lr("Addresses " + str(i))

                # NOTE(review): assumes the geocoder yields an empty (falsy)
                # result for an address it could not resolve -- confirm
                # against DstkGeocoder.geocode. Assigning into such a result
                # would raise, so build the output row from a copy and still
                # record the facility with just its id.
                out_row = dict(r) if r else {}
                out_row['facilities_id'] = inp_row['id']
                ins.insert(out_row)
Beispiel #2
0
    def build_dstk_geocoder(self):
        """Geocode the business addresses with the Data Science Toolkit.

        Reads every row of the ``businesses`` partition, formats an address
        string from its ``address``, ``city``, ``state`` and ``zip`` columns,
        geocodes it and inserts one row per geocoder result into a freshly
        cleaned ``dstk_addresses`` partition. Every inserted row carries
        ``businesses_id`` (the source row's ``id``). When the geocoder
        returned a result, the row also holds all of the result's fields
        plus a few of them copied under shorter column names.

        When ``self.run_args.test`` is set, the loop stops once the result
        index exceeds 500.
        """
        from ambry.geo.geocoders import DstkGeocoder

        lr = self.init_log_rate(250)

        businesses = self.partitions.find(table='businesses')

        def address_gen():
            # Yield (address string, source row) pairs; the source row comes
            # back from the geocoder alongside the result for that address.
            for row in businesses.query("SELECT * FROM businesses"):
                address = "{}, {}, {} {}".format(
                    row['address'], row['city'], row['state'], row['zip'])
                yield (address, row)

        dstk_service = self.config.service('dstk')

        dstk_gc = DstkGeocoder(dstk_service, address_gen())

        p = self.partitions.find_or_new(table='dstk_addresses')
        p.clean()

        # (output column, key in the geocoder result) pairs, applied in order
        # after the raw result fields have been copied into the row.
        column_sources = (
            ('number', 'street_number'),
            ('name', 'street_name'),
            ('city', 'locality'),
            ('state', 'region'),
            ('lat', 'latitude'),
            ('lon', 'longitude'),
            ('county', 'fips_county'),
        )

        with p.inserter() as ins:

            for i, (k, r, inp_row) in enumerate(dstk_gc.geocode()):

                row = {'businesses_id': inp_row['id']}

                # An empty result still produces a row, holding only the
                # business id.
                if r:
                    row.update(dict(r))
                    for column, source in column_sources:
                        row[column] = r.get(source)

                lr("Geocode DSTK")

                ins.insert(row)

                if self.run_args.test and i > 500:
                    break
Beispiel #3
0
    def test_dstk_geocoding(self):
        """Geocode a sample of unmatched addresses with DSTK and print them.

        Fetches the ``sandiegodata.org-bad_addresses-casnd-addresses``
        partition from the bundle's library, selects up to 20 address texts
        whose ``address_id`` is NULL, runs them through the DSTK geocoder
        and prints each result. Nothing is asserted; the output is meant to
        be inspected by hand.
        """
        from ambry.geo.geocoders import DstkGeocoder
        import pprint

        library = self.bundle.library
        p = library.get('sandiegodata.org-bad_addresses-casnd-addresses').partition

        dstk_service = self.rc.service('dstk')

        def address_gen():
            # Yield the bare address text of each selected row.
            for row in p.query("SELECT text from addresses where address_id is NULL limit 20"):
                yield row.text

        dstk_gc = DstkGeocoder(dstk_service, address_gen())

        for k, r in dstk_gc.geocode():
            # Single-argument print(...) calls behave the same under the
            # Python 2 print statement and the Python 3 print function.
            print('---')
            # The confidence column is left blank when there is no result.
            print("{:6s} {}".format(str(r['confidence']) if r else '', k))
            pprint.pprint(r)
Beispiel #4
0
    def test_dstk_geocoding(self):
        """Geocode a sample of unmatched addresses with DSTK and print them.

        Fetches the ``sandiegodata.org-bad_addresses-casnd-addresses``
        partition from the bundle's library, selects up to 20 address texts
        whose ``address_id`` is NULL, runs them through the DSTK geocoder
        and prints each result. Nothing is asserted; the output is meant to
        be inspected by hand.
        """
        from ambry.geo.geocoders import DstkGeocoder
        import pprint

        library = self.bundle.library
        p = library.get('sandiegodata.org-bad_addresses-casnd-addresses').partition

        dstk_service = self.rc.service('dstk')

        def address_gen():
            # Yield the bare address text of each selected row.
            query = "SELECT text from addresses where address_id is NULL limit 20"
            for row in p.query(query):
                yield row.text

        dstk_gc = DstkGeocoder(dstk_service, address_gen())

        for k, r in dstk_gc.geocode():
            # Single-argument print(...) calls behave the same under the
            # Python 2 print statement and the Python 3 print function.
            print('---')
            # The confidence column is left blank when there is no result.
            print("{:6s} {}".format(str(r['confidence']) if r else '', k))
            pprint.pprint(r)
Beispiel #5
0
    def generate_agencies(self):
        """Geocode the partner and agency addresses and yield the results.

        Addresses come from two partitions: ``sdfb_partners`` (address built
        from ``addr1``, ``city`` and ``zip`` with the state fixed to ``CA``)
        and ``agency_list`` (the ``address`` column as is). Each address is
        paired with an ``(agency id, site id, name)`` tuple; partner rows
        have no site id, so ``None`` is used there.

        Yields one dict per geocoder result with the keys:

        * ``i`` -- running index of the result,
        * ``address`` -- the address string as returned by the geocoder,
        * ``geocoded`` -- the raw geocoder result,
        * ``row`` -- a dict keyed by ``agency_id``, ``site_id``, ``name``
          and ``orig_address``, plus ``geocoded_address``, ``city``, ``lat``
          and ``lon`` only when the geocoder returned a result.
        """
        from ambry.geo.geocoders import DstkGeocoder

        def address_gen():
            # Each item is (address string, (agency id, site id, name)); the
            # tuple comes back from the geocoder alongside the result.
            for row in self.partitions.find(table='sdfb_partners').rows:
                yield ("{} {}, CA {}".format(row['addr1'].decode('ascii', 'ignore'), row['city'], row['zip']),
                       (row['agencyref'].strip(), None, row['agencyname'].strip()))

            for row in self.partitions.find(table='agency_list').rows:
                yield (row['address'].decode('ascii', 'ignore'),
                       (row['agency_id'], row['site_id'], row['name'].strip()))

        dstk_gc = DstkGeocoder(self.config.service('dstk'), address_gen())

        header = 'agency_id site_id name orig_address geocoded_address city lat lon'.split()

        for i, (k, r, o) in enumerate(dstk_gc.geocode()):

            values = [o[0], o[1], o[2], k]

            if r:
                values += [r['street_address'], r['locality'], r['latitude'], r['longitude']]

            # zip() stops at the shorter sequence, so without a geocoder
            # result the row dict holds only the first four header keys.
            yield dict(
                i=i,
                address=k,
                geocoded=r,
                row=dict(zip(header, values)),
            )