Exemple #1
0
    def test_ba_geocoding(self):
        from ambry.geo.geocoder import Geocoder

        l = self.bundle.library

        gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition

        g = Geocoder(gp)

        p = l.get('sandiegodata.org-bad_addresses-casnd-addresses').partition

        for row in p.query("SELECT text from addresses where address_id is NULL limit 10"):

            text = row.text

            text = text.replace('La Jolla', 'San Diego')

            addr_id, r, parsed = g.parse_and_code(text)

            score = r['score'] if r else None

            print '------', score, addr_id
            print row.p
            print '> ', text
            print '< ', parsed
Exemple #2
0
    def test_csv_geocoding(self):
        from ambry.geo.geocoder import Geocoder
        import test.support as ts
        import os.path

        import csv

        l = self.bundle.library

        gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition

        g = Geocoder(gp)

        with open(os.path.join(os.path.dirname(ts.__file__), 'bad_geocodes.csv')) as f:
            reader = csv.DictReader(f)

            for row in reader:

                text = row['text']

                addr_id, r, parsed = g.parse_and_code(text)

                score = r['score'] if r else None

                print '------', score, addr_id
                print '> ', text
                print '< ', parsed
Exemple #3
0
    def test_geocoding_csv_geocoder(self):
        from ambry.geo.geocoder import Geocoder
        import test.support as ts
        import os.path

        import csv

        l = self.bundle.library

        gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition

        g = Geocoder(gp)

        for row in gp.query("select * from geocoder where number > 0 limit 1000"):

            text = "{number} {dir} {name} {suffix}, {city}, {state} {zip}".format(
                number = row.number, name=row.name, state=row.state,
                city=row.city if row.city else '',
                dir = row.direction if row.direction != '-' else '',
                suffix=row.suffix if row.suffix != '-' else '',
                zip = row.zip if row.zip > 0  else ''
            )



            addr_id, r, parsed = g.parse_and_code(text)

            if not r:
                score = r['score'] if r else None

                print '------', score, addr_id
                print '> ', text
                print '< ', parsed
Exemple #4
0
    def test_txt_geocoding(self):
        from ambry.geo.geocoder import Geocoder
        import test.support as ts
        import os.path

        import csv

        l = self.bundle.library

        gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition

        city_subs = {
            'La Jolla': 'San Diego'
        }

        g = Geocoder(gp, city_subs)

        with open(os.path.join(os.path.dirname(ts.__file__), 'bad_geocodes.txt')) as f:
            for line in f:
                text = line.strip()

                addr_id, r, parsed = g.parse_and_code(text)

                score = r['score'] if r else None

                print '------', score, addr_id
                print '> ', text
                print '< ', parsed
Exemple #5
0
    def test_csv_geocoding(self):
        from ambry.geo.geocoder import Geocoder
        import test.support as ts
        import os.path

        import csv

        l = self.bundle.library

        gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition

        g = Geocoder(gp)

        with open(
                os.path.join(os.path.dirname(ts.__file__),
                             'bad_geocodes.csv')) as f:
            reader = csv.DictReader(f)

            for row in reader:

                text = row['text']

                addr_id, r, parsed = g.parse_and_code(text)

                score = r['score'] if r else None

                print '------', score, addr_id
                print '> ', text
                print '< ', parsed
Exemple #6
0
    def test_ba_geocoding(self):
        from ambry.geo.geocoder import Geocoder

        l = self.bundle.library

        gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition

        g = Geocoder(gp)

        p = l.get('sandiegodata.org-bad_addresses-casnd-addresses').partition

        for row in p.query(
                "SELECT text from addresses where address_id is NULL limit 10"
        ):

            text = row.text

            text = text.replace('La Jolla', 'San Diego')

            addr_id, r, parsed = g.parse_and_code(text)

            score = r['score'] if r else None

            print '------', score, addr_id
            print row.p
            print '> ', text
            print '< ', parsed
Exemple #7
0
    def test_txt_geocoding(self):
        from ambry.geo.geocoder import Geocoder
        import test.support as ts
        import os.path

        import csv

        l = self.bundle.library

        gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition

        city_subs = {'La Jolla': 'San Diego'}

        g = Geocoder(gp, city_subs)

        with open(
                os.path.join(os.path.dirname(ts.__file__),
                             'bad_geocodes.txt')) as f:
            for line in f:
                text = line.strip()

                addr_id, r, parsed = g.parse_and_code(text)

                score = r['score'] if r else None

                print '------', score, addr_id
                print '> ', text
                print '< ', parsed
Exemple #8
0
    def test_geocoding_csv_geocoder(self):
        from ambry.geo.geocoder import Geocoder
        import test.support as ts
        import os.path

        import csv

        l = self.bundle.library

        gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition

        g = Geocoder(gp)

        for row in gp.query(
                "select * from geocoder where number > 0 limit 1000"):

            text = "{number} {dir} {name} {suffix}, {city}, {state} {zip}".format(
                number=row.number,
                name=row.name,
                state=row.state,
                city=row.city if row.city else '',
                dir=row.direction if row.direction != '-' else '',
                suffix=row.suffix if row.suffix != '-' else '',
                zip=row.zip if row.zip > 0 else '')

            addr_id, r, parsed = g.parse_and_code(text)

            if not r:
                score = r['score'] if r else None

                print '------', score, addr_id
                print '> ', text
                print '< ', parsed
Exemple #9
0
    def x_test_crime(self):
        """Geocode crime incidents by block address and log the ambiguous ones.

        Iterates up to 100000 rows of the ``incidents`` table from the 'crime'
        dependency and calls ``geocode_semiblock`` on each block address. A
        row is written to errors.csv (through ``self.write_error_row``) when
        there are no candidates ('norsp'), when the candidate mapping does not
        hold exactly one entry ('mcities'), or when the single entry has more
        than three items ('maddr'). Running totals are reported through the
        bundle's rate logger.
        """
        from ambry.geo.address import Parser
        from ambry.geo.geocoder import Geocoder
        import csv

        geocoder = Geocoder(self.bundle.library, addresses_ds='geoaddresses')
        _, incidents = self.bundle.library.dep('crime')

        log_rate = self.bundle.init_log_rate(1000)

        parser = Parser()

        header = ['code','arg','block_address','city','number','dir','street','type']

        with open(self.bundle.filesystem.path('errors.csv'), 'wb') as f:
            writer = csv.writer(f)
            writer.writerow(header)

            # Float counters, so the percentage division below is not integer
            # division on Python 2.
            multi_cities = multi_addr = no_response = 0.0

            for i, inct in enumerate(incidents.query("SELECT * FROM incidents limit 100000")):
                row = dict(inct)
                block_address = row['blockaddress']
                city = row['city']

                candidates = geocoder.geocode_semiblock(block_address, city, 'CA')
                n_candidates = len(candidates)

                if n_candidates == 0:
                    no_response += 1
                    self.write_error_row('norsp', 0, parser, writer, block_address, city)
                    continue

                if n_candidates != 1:
                    multi_cities += 1
                    self.write_error_row('mcities', n_candidates, parser, writer, block_address, city)
                    continue

                # Exactly one entry is left; take its value.
                s = candidates.popitem()[1]

                if len(s) > 3:
                    self.write_error_row('maddr', len(s), parser, writer, block_address, city)
                    multi_addr += 1

                # The index is the divisor, so the first row is skipped to
                # avoid dividing by zero.
                if i > 0:
                    message = "{}  cities={}, {}% addr={}, {}%  nrp={}, {}%".format(
                        i,
                        multi_cities, int(multi_cities/i * 100),
                        multi_addr, int(multi_addr/i * 100),
                        no_response, int(no_response/i * 100))
                    log_rate(message)
Exemple #10
0
    def test_basic(self):

        from ambry.geo.geocoder import Geocoder
        
        g = Geocoder(self.bundle.library)               
                     
        filename = "good_segments"
        f_input =  os.path.join(os.path.dirname(__file__), '../support',filename + '.txt')
        f_output =  os.path.join(os.path.dirname(__file__), '../support',filename + '.out.csv')
                   
        with open(f_input) as f:
            for line in f:
                addr = line.strip()
                r  = g.geocode_address(addr)
                print "==", addr
                print "->",r
                if r:
                    print "  ", r['codedaddress']
Exemple #11
0
    def build_alcohol(self, p):
        from address_parser import Parser
        from ambry.geo.geocoder import Geocoder
        
        gp = self.library.dep('geocoder').partition
        
        g = Geocoder(gp)
        
        ap = Parser()
        
        
        ip = self.library.dep('alcohol').partition
        lr = self.init_log_rate(1000)
        
        with p.inserter() as ins:
            for row in ip.query("SELECT * FROM licenses"):
            
                lr()
                
                if not row['premisesaddress']:
                    continue
                

                try:
                    address_id, result, parsed = g.parse_and_code(row['premisesaddress'])
                except AttributeError as e:
                    print e
                    continue
                    
               
                
                d = parsed.args
                
                d['text'] = str(parsed)
                d['orig_text'] = row['premisesaddress']
                d['source'] = 'alco'
                d['address_id'] = address_id
                if result:
                    d['score'] = result['score']
                
                ins.insert(d)

                
        return True
Exemple #12
0
    def build_masterlist(self, p):
        from address_parser import Parser
        from ambry.geo.geocoder import Geocoder
        
        gp = self.library.dep('geocoder').partition
        
        g = Geocoder(gp)
        
        ap = Parser()
        
        
        ip = self.library.dep('masterlist').partition
        lr = self.init_log_rate(1000)
     
     
        streets = set()
        
        with p.inserter() as ins:
            for row in ip.query("SELECT * FROM businesses WHERE address_id IS NULL"):
            
                row = dict(row)
            
                row['city'] = row['city'].strip().title() if row['city'] else ''
            
                if row['city'].strip().title() == 'La Jolla':
                    row['city'] = 'San Diego'

            
                ps = ap.parse(row['address'], row['city'], row['state'], row['zip'])
                
                try:
                    address_id, result, parsed = g.parse_and_code(str(ps))
                     
                except AttributeError as e:
                    print e
                    raise
                    continue
                    
                
                d = ps.args

                d['text'] = str(ps)
                d['orig_text'] = "{}, {}, {} {}".format(row['address'], row['city'], row['state'], row['zip'])
                d['source'] = 'sdbml'
                d['address_id'] = address_id

                k = (d['direction'], d['name'], d['suffix'])

                if not k in streets:
                    streets.add(k)
                    
                    d['for_testing'] = 'y'

                
                ins.insert(d)
                lr()
                
                #print ps
             
                
        return True
Exemple #13
0
    def build_ck_geocoder(self):
        """Create a crosswalk to CK geocoded addresses, which link to SANDAG data.

        Every row of the ``businesses`` partition gets one row in the (freshly
        cleaned) ``ck_addresses`` partition. Rows that have a city are
        geocoded; when the geocoder result is truthy its fields are merged
        into the output row and ``name`` is rebuilt as
        "<direction> <name> <suffix>" with empty parts left out. Rows with no
        city, or with a falsy result, are counted as bad and inserted with
        only the fields collected so far.

        When ``self.run_args.test`` is set, processing stops after the row
        with index 501.

        Raises:
            Exception: any error raised while geocoding a row is reported
                through ``self.error`` and re-raised.
        """
        from ambry.geo.geocoder import Geocoder

        city_subs = {
            'La Jolla': 'San Diego'
        }

        g = Geocoder(self.library.dep('geocoder').partition, city_subs)

        lr = self.init_log_rate(250)

        businesses = self.partitions.find(table='businesses')

        p = self.partitions.find_or_new(table='ck_addresses')
        p.clean()

        good = 0
        bad = 0

        with p.inserter() as ins:

            for i, bus in enumerate(businesses.rows):

                row = {
                    'businesses_id': bus['id']
                }

                # Reset for every row: a business with no city must count as a
                # failure, not pick up the match left over from the previous
                # row (or raise NameError on the very first row).
                result = None

                try:
                    # This just lets us know what addresses aren't geocoding. We'll use the failures
                    # as bad addresses in a geocoder update.

                    if bus['city']:
                        row['address_id'], result, parsed = g.parse_and_code(
                            bus['address'],
                            city=bus['city'].title(), state="CA", zip=bus['zip'])

                        row['parsed_addr'] = "{}, {}, CA {}".format(
                            parsed.text, parsed.locality.city, parsed.locality.zip)

                    if result:
                        row.update(result)

                        # Build each optional part separately. A single chained
                        # conditional expression groups as
                        # ``a if c1 else (b if c2 else '')`` and drops the
                        # street name whenever a direction is present.
                        direction = row['direction'] + ' ' if row['direction'] else ''
                        suffix = ' ' + row['suffix'] if row['suffix'] else ''
                        row['name'] = direction + row['name'] + suffix

                        row['id'] = None
                        good += 1
                    else:
                        bad += 1

                except Exception as e:
                    self.error("Failed to parse row {}: {} : {} ".format(i, bus['address'], e.message))
                    raise

                # Exactly one of good/bad was incremented above, so total >= 1.
                total = good + bad
                lr("Geocode CK: {} good / {} bad ( {}%) of {}".format(
                    good, bad, round(float(good) / float(total) * 100, 1), total))

                ins.insert(row)

                if self.run_args.test and i > 500:
                    break