Esempio n. 1
0
    def test_geocode(self):
        """Print the geocoder rows that match each road in the fixture CSV.

        Every ``primary_rd`` value in ``support/intersections.csv`` is parsed
        as a street address (prefixed with a dummy house number of 1000) and
        the parsed road components are passed as bind parameters to a query
        against the ``geocoder`` table. The parsed road and every matching
        row are printed; nothing is asserted.
        """
        from address_parser import Parser
        import csv
        import ambry
        import os

        l = ambry.library()

        gp = l.get('clarinova.com-geocode-casnd-geocoder').partition

        f_intersections = os.path.join(os.path.dirname(__file__), 'support', 'intersections.csv')

        # The suffix column must be compared with the bound :suffix parameter;
        # comparing the column with itself matches every non-NULL suffix.
        q = """
        SELECT *
        FROM geocoder
        WHERE name = :name AND  direction = :direction AND suffix = :suffix
        """

        p = Parser()

        with open(f_intersections) as f:
            reader = csv.DictReader(f)
            for r in reader:
                ps = p.parse('1000 ' + r['primary_rd'])
                road = ps.road.dict
                print(road)

                for qr in gp.query(q, **road):
                    # Five leading spaces: same output as the Python 2
                    # statement ``print "    ", qr``.
                    print("     {}".format(qr))
Esempio n. 2
0
    def test_geocode(self):
        """Print the geocoder rows that match each road in the fixture CSV.

        Every ``primary_rd`` value in ``support/intersections.csv`` is parsed
        as a street address (prefixed with a dummy house number of 1000) and
        the parsed road components are passed as bind parameters to a query
        against the ``geocoder`` table. The parsed road and every matching
        row are printed; nothing is asserted.
        """
        from address_parser import Parser
        import csv
        import ambry
        import os

        l = ambry.library()

        gp = l.get('clarinova.com-geocode-casnd-geocoder').partition

        f_intersections = os.path.join(os.path.dirname(__file__), 'support',
                                       'intersections.csv')

        # The suffix column must be compared with the bound :suffix parameter;
        # comparing the column with itself matches every non-NULL suffix.
        q = """
        SELECT *
        FROM geocoder
        WHERE name = :name AND  direction = :direction AND suffix = :suffix
        """

        p = Parser()

        with open(f_intersections) as f:
            reader = csv.DictReader(f)
            for r in reader:
                ps = p.parse('1000 ' + r['primary_rd'])
                road = ps.road.dict
                print(road)

                for qr in gp.query(q, **road):
                    # Five leading spaces: same output as the Python 2
                    # statement ``print "    ", qr``.
                    print("     {}".format(qr))
Esempio n. 3
0
    def test_address_files(self):
        """Parse every line of ``support/crime_addresses.txt`` and print a tally.

        Each input line and its parsed form are printed. A line counts as a
        failure when the parser raises or returns a falsy result, and as a
        success otherwise. ``support/crime_addresses.out.csv`` is created and
        receives only the header row built from ``self.header``.

        The previous version had an unconditional ``continue`` after printing
        the parsed address, which made the per-row bookkeeping unreachable and
        left ``success`` at 0; that dead code is removed and successful parses
        are now counted.
        """
        import os
        from address_parser import Parser
        import csv

        parser = Parser()

        success = 0
        failure = 0
        total = 0
        filename = "crime_addresses"
        support_dir = os.path.join(os.path.dirname(__file__), 'support')
        f_input = os.path.join(support_dir, filename + '.txt')
        f_output = os.path.join(support_dir, filename + '.out.csv')

        with open(f_output, 'w') as out:
            writer = csv.DictWriter(out, self.header)
            writer.writeheader()
            with open(f_input) as f:
                for line in f:

                    total += 1

                    print('----')
                    print(line.strip())

                    try:
                        ps = parser.parse(line)
                    except Exception as e:
                        print("ERROR {}".format(e))
                        failure += 1
                        continue

                    if not ps:
                        failure += 1
                        continue

                    print(ps)
                    success += 1

            # An empty input file must not crash the summary with a
            # ZeroDivisionError.
            if total:
                rate = round(float(failure) / float(total) * 100, 3)
            else:
                rate = 0.0

            print('')
            print("total={} success={} failure={} rate={}".format(
                total, success, failure, rate))
Esempio n. 4
0
    def test_hash(self):
        """Parse a sample address and pretty-print its dictionary form."""
        from pprint import pprint

        # Two spellings of one address; only the second one is parsed.
        a1 = '119 WEST WINTON AVENUE, HAYWARD, CA, 94544'
        a2 = '119 Winton Ave., Hayward, Ca, 94544-5000'

        parsed = Parser().parse(a2)
        pprint(parsed.dict)
Esempio n. 5
0
    def test_address_files(self):
        """Parse every line of ``support/crime_addresses.txt`` and print a tally.

        Each input line and its parsed form are printed. A line counts as a
        failure when the parser raises or returns a falsy result, and as a
        success otherwise. ``support/crime_addresses.out.csv`` is created and
        receives only the header row built from ``self.header``.

        The previous version had an unconditional ``continue`` after printing
        the parsed address, which made the per-row bookkeeping unreachable and
        left ``success`` at 0; that dead code is removed and successful parses
        are now counted.
        """
        import os
        from address_parser import Parser
        import csv

        parser = Parser()

        success = 0
        failure = 0
        total = 0
        filename = "crime_addresses"
        support_dir = os.path.join(os.path.dirname(__file__), 'support')
        f_input = os.path.join(support_dir, filename + '.txt')
        f_output = os.path.join(support_dir, filename + '.out.csv')

        with open(f_output, 'w') as out:
            writer = csv.DictWriter(out, self.header)
            writer.writeheader()
            with open(f_input) as f:
                for line in f:

                    total += 1

                    print('----')
                    print(line.strip())

                    try:
                        ps = parser.parse(line)
                    except Exception as e:
                        print("ERROR {}".format(e))
                        failure += 1
                        continue

                    if not ps:
                        failure += 1
                        continue

                    print(ps)
                    success += 1

            # An empty input file must not crash the summary with a
            # ZeroDivisionError.
            if total:
                rate = round(float(failure) / float(total) * 100, 3)
            else:
                rate = 0.0

            print('')
            print("total={} success={} failure={} rate={}".format(
                total, success, failure, rate))
Esempio n. 6
0
        def address_gen():
            """Yield ``(address, row)`` pairs for rows of the ``crimeb`` table.

            Rows without a ``block_address`` are skipped. When the block
            address parses to a positive house number, the number is replaced
            by a random one inside the same 100 block (for example 1200-1299);
            otherwise the original text is used with the word ``BLOCK``
            removed. The city falls back to the agency name, except for the
            agency value ``'SHERRIF'``, and then to an empty string.
            """

            from random import randint
            from address_parser import Parser

            parser = Parser()

            for row in self.partition(table='crimeb'):

                if not row.block_address:
                    continue

                # NOTE(review): this also rewrites 'EL CAMINO' into
                # 'El CaminoINO' -- confirm the data only has the short form.
                ba = str(row.block_address).replace('EL CAM', 'El Camino')

                ps = parser.parse(ba)

                if ps and ps.number.number and ps.number.number > 0:
                    # Floor to the start of the 100 block. round(n, -2) could
                    # round up into the next block, and randint(0, 100) is
                    # inclusive, so it could also step into the next block.
                    block_start = int(ps.number.number) // 100 * 100
                    ps.number.number = block_start + randint(0, 99)

                    street_num = str(ps)
                else:
                    street_num = ba.replace('BLOCK', '')

                city = row.city

                if not city and row.agency != 'SHERRIF':
                    city = row.agency

                if not city:
                    city = ''

                zipcode = ', {}'.format(row.zipcode) if row.zipcode else ''

                address = '{} {} CA{}'.format(street_num, city, zipcode)

                yield (address, row)
Esempio n. 7
0
def chunked_geocode(addresses, state=None, chunk_size=250):
    """Geocode addresses in batches and yield one result per returned row.

    Args:
        addresses: iterable of ``(unique_id, address)`` tuples.
        state: when given, overrides the state parsed from each address.
        chunk_size: maximum number of addresses sent per ``make_request``
            call.

    Yields:
        ``(row_n, True, mkdict(row))`` where ``row_n`` counts the result rows
        across all batches, starting at 0. The second element is always
        ``True`` here.

    The response row columns are:
    unique_id input_address match quality match_address latlon tiger_id
    side_of_street state_fips county_fips tract_fips block_fips
    """

    parser = Parser()

    row_n = 0

    request_rows = []

    for uid, address_line in addresses:

        p = parser.parse(address_line)

        request_rows.append([
            uid,
            p.street_str(), p.locality.city, state or p.locality.state,
            p.locality.zip
        ])

        # ``>=`` keeps each batch at chunk_size rows; ``>`` would send
        # chunk_size + 1.
        if len(request_rows) >= chunk_size:

            for row in make_request(request_rows):
                yield row_n, True, mkdict(row)
                row_n += 1

            request_rows = []

    # Flush the final partial batch, but do not issue an empty request.
    if request_rows:
        for row in make_request(request_rows):
            yield row_n, True, mkdict(row)
            row_n += 1
Esempio n. 8
0
class Geocoder(object):
    """Look up parsed street addresses in a partition's ``geocoder`` table.

    Lookups made through :meth:`parse_and_code` are cached in
    ``address_cache``, keyed by the parsed address hash; misses are cached
    as ``None``.
    """

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, partition, city_subs=None):
        """Store the partition and prepare the parser and caches.

        Args:
            partition: object whose ``query(sql, **params)`` result offers
                ``first()``; it must hold the ``geocoder`` table.
            city_subs: optional mapping of city name to replacement city
                name; keys are matched case-insensitively.
        """
        from address_parser import Parser

        self.p = partition

        self.address_cache = {}

        self.city_subs = (
            {k.lower(): v for k, v in city_subs.items()} if city_subs else {})

        self.parser = Parser()

    def parse_and_code(self, addrstr, city=None, state=None, zip=None):
        """Parse an address string and geocode it, using the cache.

        Returns:
            ``(address_id, record, parsed_address)``; ``address_id`` and
            ``record`` are ``None`` when no match was found.
        """

        adr = self.parser.parse(addrstr, city=city, state=state, zip=zip)
        key = adr.hash

        if key in self.address_cache:
            r = self.address_cache[key]
        else:
            r = self.geocode(**adr.args)
            # Cache misses too, so repeated unknown addresses skip the query.
            self.address_cache[key] = r if r else None

        address_id = r['address_id'] if r else None

        return address_id, r, adr

    @classmethod
    def _normalize_zip(cls, zip_code):
        """Return the 5-digit part of a zip as an int, or -1 if unusable."""
        if isinstance(zip_code, cls._string_types):
            # Drop any ZIP+4 suffix; tolerate more than one dash.
            zip_code = zip_code.partition('-')[0]

        if not zip_code:
            return -1

        try:
            return int(zip_code)
        except (TypeError, ValueError, OverflowError):
            return -1

    def geocode(self, number, name, direction=None,
                suffix=None, city=None, state=None, zip=None):
        """Return a record from the geocoder table.

        This function expects a partition, p, that holds a table named
        'geocoder', of the same structure as used in
        clarinova.com-geocode-casnd

        A candidate must match ``name`` and ``direction``, lie within 100 of
        ``number`` and match at least two of city, zip and suffix. The
        candidate with the nearest number wins; ties go to the higher score.

        Returns:
            A dict of the matched row with ``confidence`` added and
            ``lat``/``lon`` divided by 100000000.0, or ``None`` when nothing
            matches.
        """

        direction = direction.upper() if direction else '-'
        # The query compares against the lower-cased suffix.
        suffix = suffix.lower() if suffix else '-'
        city = city.title() if city else '-'

        if city.lower() in self.city_subs:
            city = self.city_subs[city.lower()].title()

        zip = self._normalize_zip(zip)

        # We don't need to check for nulls in direction, b/c entries without
        # directions have the value '-'
        q = """
        SELECT
            *,
            (
                CASE WHEN city = :city THEN 10 ELSE 0 END +
                CASE WHEN zip = :zip THEN 10 ELSE 0 END +
                CASE WHEN suffix = :suffix THEN 10 ELSE 0 END
            ) AS score,
            ABS(number - :number) as ndist

        FROM geocoder
        WHERE  name = :name AND direction = :direction
        AND score >= 20
        AND number BETWEEN (:number-100) AND (:number+100)
        ORDER BY ABS(number - :number), score DESC LIMIT 1;
        """

        r = self.p.query(
            q,
            number=number,
            name=name,
            direction=direction,
            suffix=suffix,
            city=city,
            state=state,
            zip=zip).first()

        if not r:
            return None

        r = dict(r)
        # A perfect match (score 30, distance 0) has confidence 1.0.
        r['confidence'] = round(
            (100.0 - (30.0 - r['score']) - (r['ndist'] / 2.0)) / 100.0, 3)
        r['lat'] = float(r['lat']) / 100000000.0
        r['lon'] = float(r['lon']) / 100000000.0
        return r

    def geocode_intersection(self, street1, street2):
        """Placeholder for intersection geocoding; not implemented."""
        pass
Esempio n. 9
0
# Load the pickled object stored in 'tickets.pkl' into tickets_acc.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# that this project produced itself.
with open('tickets.pkl', 'rb') as f:
    tickets_acc = pickle.load(f)

from collections import defaultdict

# Every missing key starts out as a fresh two-element list [0, 0].
d = defaultdict(lambda : [0,0])

def mkstreet(number, name, suffix):
    """Return the street key: number, name and suffix joined by single spaces."""
    template = "{} {} {}"
    return template.format(number, name, suffix)


# Create a dict of date/street pairs, then mark each pair for whether it
# was swept and then whether it was ticketed.
# NOTE(review): only the street expansion is visible here; `streets`,
# `dates` and `d` are not consumed within this loop as shown.
for base_street, dates in gps_acc.items():
    ps = parser.parse(base_street)
    
    # Expand each street block to itself plus the next three 100 blocks
    # (and, below, up to two preceding blocks), to deal with possible
    # missing GPS reverse-geocodes
    streets = [mkstreet(ps.number.number, ps.road.name, ps.road.suffix),
              mkstreet(ps.number.number+100, ps.road.name, ps.road.suffix),
              mkstreet(ps.number.number+200, ps.road.name, ps.road.suffix),
              mkstreet(ps.number.number+300, ps.road.name, ps.road.suffix)
          ]
    
    # Add a preceding block only when its number would not go negative.
    if ps.number.number >= 100:
        streets.append(mkstreet(ps.number.number-100, ps.road.name, ps.road.suffix))
        
    if ps.number.number >= 200:
        streets.append(mkstreet(ps.number.number-200, ps.road.name, ps.road.suffix))
        
Esempio n. 10
0
    def build_masterlist(self, p):
        """Geocode un-coded business addresses and insert them into ``p``.

        Reads every row of ``businesses`` in the ``masterlist`` dependency
        whose ``address_id`` is NULL, normalizes the city (title-cased, with
        'La Jolla' mapped to 'San Diego'), parses and geocodes the address,
        and inserts the parsed address fields plus ``text``, ``orig_text``,
        ``source`` and ``address_id`` into ``p``. The first row seen for each
        ``(direction, name, suffix)`` street is flagged ``for_testing = 'y'``.

        Args:
            p: partition providing ``inserter()`` as a context manager.

        Returns:
            ``True`` once all rows have been processed.

        Raises:
            AttributeError: re-raised, after printing it, when geocoding an
                address fails with that error.
        """
        from address_parser import Parser
        from ambry.geo.geocoder import Geocoder

        gp = self.library.dep('geocoder').partition

        g = Geocoder(gp)

        ap = Parser()

        ip = self.library.dep('masterlist').partition
        lr = self.init_log_rate(1000)

        # Streets already seen, as (direction, name, suffix) tuples.
        streets = set()

        with p.inserter() as ins:
            for row in ip.query("SELECT * FROM businesses WHERE address_id IS NULL"):

                row = dict(row)

                city = row['city'].strip().title() if row['city'] else ''

                if city == 'La Jolla':
                    city = 'San Diego'

                row['city'] = city

                ps = ap.parse(row['address'], row['city'], row['state'], row['zip'])

                try:
                    address_id, _result, _parsed = g.parse_and_code(str(ps))
                except AttributeError as e:
                    # Show the error before propagating it.
                    print(e)
                    raise

                d = ps.args

                d['text'] = str(ps)
                d['orig_text'] = "{}, {}, {} {}".format(row['address'], row['city'], row['state'], row['zip'])
                d['source'] = 'sdbml'
                d['address_id'] = address_id

                k = (d['direction'], d['name'], d['suffix'])

                if k not in streets:
                    streets.add(k)

                    d['for_testing'] = 'y'

                ins.insert(d)
                lr()

        return True
Esempio n. 11
0
from address_parser import Parser
import re


# Sample inputs: a postal address and a free-text contact line.
addresss = '387 View Ave apt4 Twin Falls, ID 83301'

info = 'Email:  [email protected] Phone:  +1 (956) 8574114'

# Raw strings keep the regex escapes (\S, \d, \s) out of Python's own string
# escape processing; the patterns themselves are unchanged.
email = re.findall(r'\S+@\S+', info)

# Matches 10-digit numbers, "(ddd) ddd-dddd" style numbers, or bare
# 7-digit numbers, each with optional '-', '.' or whitespace separators.
ok = re.findall(r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}', info)

parser = Parser()
adr = parser.parse(addresss)

print(f'{adr.number.number} {adr.road.direction} {adr.road.name} {adr.road.suffix}')
print(f'{adr.text}')
# re.findall returns an empty list when nothing matches; print an empty line
# in that case instead of raising IndexError.
print(email[0] if email else '')
print(ok[0] if ok else '')
Esempio n. 12
0
import time

# Build a mapping of "<100-block number> <road name> <road suffix>" to the set
# of issue dates seen for that block, then pickle it to 'tickets.pkl'.
f = MPRowsFile('/Users/eric/proj/virt/ambry10/library/build/nbcuni.com/streetsweep/nbcuni.com/streetsweep-0.0.1/tickets.mpr')

parser = Parser()

start = time.time()
# NOTE(review): `s` is not used anywhere in the visible script.
s = 0 
from collections import defaultdict
acc = defaultdict(set)
with f.reader as r:
    # Row numbering starts at 1 so the progress report fires on exact
    # multiples of 10000 rows.
    for i, row in enumerate(r, 1):

        adr = row.locationdesc1
        if adr:
            ps = parser.parse(adr)
            dt = row.issuedate
            # Only rows with a positive house number and an issue date count.
            if ps.number.number > 0 and dt:
                # Reduce the house number to the start of its 100 block.
                number = int(ps.number.number / 100) * 100
                
                key = "{} {} {}".format(number, ps.road.name, ps.road.suffix)
                # NOTE(review): prints every accepted row -- looks like
                # leftover debugging output; confirm before a full run.
                print row
                acc[key].add(dt)
        
        if i % 10000 == 0:
          
            # Progress: rows read, distinct keys, rows per second.
            print i, len(acc), round( float(i) / ( time.time() - start), 2) 

# NOTE: `f` is rebound here from the MPRowsFile to the output file handle.
with open('tickets.pkl', 'wb') as f:
    pickle.dump(acc, f)
        
    def get_pubmed_data(self,
                        query,
                        searched_zipcode,
                        date,
                        maximum_number_of_value=3):
        """Query PubMed and write the collected article data to a CSV file.

        Per-author values of one article are joined with ``"||"`` into a
        single cell. An article is recorded only while ``self.is_us`` is
        truthy, and the flag is reset to ``False`` after each recorded
        article.

        Args:
            query: search expression passed to ``PubMed.query``.
            searched_zipcode: string; when it consists only of decimal
                digits, articles are filtered by
                ``self.has_match_zipcode_of_authprs``.
            date: string in ``%Y/%m/%d`` format, used for the CSV file name.
            maximum_number_of_value: maximum number of results requested.

        Side effects:
            Prints progress information and, when at least one article was
            recorded, writes ``PubMedData_From_<YYYY_MM_DD>.csv``.
        """
        not_available = "<NOT_AVAILABLE>"
        separator = "||"

        csv_data = {
            "affiliation": [],
            "number_of_authors": [],
            "authors_name": [],
            "authors_institute": [],
            "authors_address": [],
            "authors_zipcode": [],
            "paper_title": [],
            "publication_date": [],
            "journal": []
        }
        pubmed = PubMed(tool="MyTool", email="*****@*****.**")
        parser = Parser()

        results = pubmed.query(query, max_results=maximum_number_of_value)
        is_queried_by_zipcode = searched_zipcode.isdecimal()

        if is_queried_by_zipcode:
            searched_zipcode = int(searched_zipcode)

        for article in results:
            json_data = json.loads(article.toJSON())
            authors_list = json_data['authors']
            num_authors = len(authors_list)

            counted_matched = 0
            if is_queried_by_zipcode:
                counted_matched = self.has_match_zipcode_of_authprs(
                    authors_list, searched_zipcode)
                if not counted_matched > 0:
                    # NOTE(review): this stops at the first article without a
                    # zipcode match, so later articles are never inspected.
                    # Kept as in the original; confirm whether ``continue``
                    # was intended.
                    break

            names = []
            institutes = []
            affiliations = []
            addresses = []
            zipcodes = []
            for author in authors_list:
                affiliation = author["affiliation"] or not_available
                zipcode = str(self.get_address_with_zipcode(affiliation))

                # Fall back only when no name part is present. The previous
                # ``first + " " + last or ...`` could never reach the
                # fallback and raised TypeError when a part was None.
                name_parts = (author['firstname'], author["lastname"])
                author_name = " ".join(
                    part for part in name_parts if part) or not_available

                institutes.append(
                    self.get_organization(affiliation=affiliation) + " ")
                affiliations.append(affiliation)
                names.append(author_name)
                addresses.append(str(parser.parse(affiliation)))
                zipcodes.append(zipcode)

            paper_title = json_data['title'] or not_available
            publication_date = json_data['publication_date'] or not_available
            journal = json_data['journal'] or not_available

            # Reaching this point already implies the zipcode filter passed.
            if self.is_us:
                csv_data["authors_name"].append(separator.join(names))
                csv_data["affiliation"].append(separator.join(affiliations))
                csv_data["authors_institute"].append(
                    separator.join(institutes))
                csv_data["paper_title"].append(paper_title)
                csv_data["publication_date"].append(publication_date)
                csv_data["journal"].append(journal)
                csv_data["authors_address"].append(separator.join(addresses))
                csv_data["number_of_authors"].append(num_authors)
                csv_data["authors_zipcode"].append(separator.join(zipcodes))
                self.is_us = False

        print("Size of csv ", len(csv_data["paper_title"]))
        if len(csv_data["paper_title"]) > 0:
            df = pd.DataFrame(csv_data)
            print(df.head())
            datetimeobject = datetime.datetime.strptime(date, '%Y/%m/%d')
            csv_file_name = "PubMedData_From_" + datetimeobject.strftime(
                '%Y_%m_%d') + ".csv"
            print(csv_file_name)
            df.to_csv(csv_file_name, index=False)
Esempio n. 14
0
class Geocoder(object):
    """Look up parsed street addresses in a partition's ``geocoder`` table.

    Lookups made through :meth:`parse_and_code` are cached in
    ``address_cache``, keyed by the parsed address hash; misses are cached
    as ``None``.
    """

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, partition, city_subs=None):
        """Store the partition and prepare the parser and caches.

        Args:
            partition: object whose ``query(sql, **params)`` result offers
                ``first()``; it must hold the ``geocoder`` table.
            city_subs: optional mapping of city name to replacement city
                name; keys are matched case-insensitively.
        """
        from address_parser import Parser

        self.p = partition

        self.address_cache = {}

        self.city_subs = (
            {k.lower(): v for k, v in city_subs.items()} if city_subs else {})

        self.parser = Parser()

    def parse_and_code(self, addrstr, city=None, state=None, zip=None):
        """Parse an address string and geocode it, using the cache.

        Returns:
            ``(address_id, record, parsed_address)``; ``address_id`` and
            ``record`` are ``None`` when no match was found.
        """

        adr = self.parser.parse(addrstr, city=city, state=state, zip=zip)
        key = adr.hash

        if key in self.address_cache:
            r = self.address_cache[key]
        else:
            r = self.geocode(**adr.args)
            # Cache misses too, so repeated unknown addresses skip the query.
            self.address_cache[key] = r if r else None

        address_id = r['address_id'] if r else None

        return address_id, r, adr

    @classmethod
    def _normalize_zip(cls, zip_code):
        """Return the 5-digit part of a zip as an int, or -1 if unusable."""
        if isinstance(zip_code, cls._string_types):
            # Drop any ZIP+4 suffix; tolerate more than one dash.
            zip_code = zip_code.partition('-')[0]

        if not zip_code:
            return -1

        try:
            return int(zip_code)
        except (TypeError, ValueError, OverflowError):
            return -1

    def geocode(self, number, name, direction=None,
                suffix=None, city=None, state=None, zip=None):
        """Return a record from the geocoder table.

        This function expects a partition, p, that holds a table named
        'geocoder', of the same structure as used in
        clarinova.com-geocode-casnd

        A candidate must match ``name`` and ``direction``, lie within 100 of
        ``number`` and match at least two of city, zip and suffix. The
        candidate with the nearest number wins; ties go to the higher score.

        Returns:
            A dict of the matched row with ``confidence`` added and
            ``lat``/``lon`` divided by 100000000.0, or ``None`` when nothing
            matches.
        """

        direction = direction.upper() if direction else '-'
        # The query compares against the lower-cased suffix.
        suffix = suffix.lower() if suffix else '-'
        city = city.title() if city else '-'

        if city.lower() in self.city_subs:
            city = self.city_subs[city.lower()].title()

        zip = self._normalize_zip(zip)

        # We don't need to check for nulls in direction, b/c entries without
        # directions have the value '-'
        q = """
        SELECT
            *,
            (
                CASE WHEN city = :city THEN 10 ELSE 0 END +
                CASE WHEN zip = :zip THEN 10 ELSE 0 END +
                CASE WHEN suffix = :suffix THEN 10 ELSE 0 END
            ) AS score,
            ABS(number - :number) as ndist

        FROM geocoder
        WHERE  name = :name AND direction = :direction
        AND score >= 20
        AND number BETWEEN (:number-100) AND (:number+100)
        ORDER BY ABS(number - :number), score DESC LIMIT 1;
        """

        r = self.p.query(
            q,
            number=number,
            name=name,
            direction=direction,
            suffix=suffix,
            city=city,
            state=state,
            zip=zip).first()

        if not r:
            return None

        r = dict(r)
        # A perfect match (score 30, distance 0) has confidence 1.0.
        r['confidence'] = round(
            (100.0 - (30.0 - r['score']) - (r['ndist'] / 2.0)) / 100.0, 3)
        r['lat'] = float(r['lat']) / 100000000.0
        r['lon'] = float(r['lon']) / 100000000.0
        return r

    def geocode_intersection(self, street1, street2):
        """Placeholder for intersection geocoding; not implemented."""
        pass
Esempio n. 15
0
    def test_address_files(self):
        """Round-trip the address fixture files through the parser.

        For each fixture, every input line is parsed and the parsed fields
        (without ``hash``, ``locality``, ``text`` and ``road``) are written
        to ``support/<name>.out.csv``. A line counts as a failure when the
        parser raises (other than ``TypeError``, which propagates), returns a
        falsy result, or when the re-rendered address differs from the input
        and the input is neither a block address nor an intersection. A
        summary of the running totals is printed after each fixture.
        """
        import os

        import csv

        parser = Parser()

        success = 0
        failure = 0
        total = 0

        # Structured sub-objects that are not written to the CSV.
        excluded_keys = ('hash', 'locality', 'text', 'road')
        support_dir = os.path.join(os.path.dirname(__file__), 'support')

        for filename in ["crime_addresses"]:
            f_input = os.path.join(support_dir, filename + '.txt')
            f_output = os.path.join(support_dir, filename + '.out.csv')
            # newline='' lets the csv module control line endings itself.
            with open(f_output, 'w', newline='') as out:
                writer = csv.DictWriter(out, self.header)
                writer.writeheader()
                with open(f_input) as f:
                    for line in f:

                        total += 1

                        try:
                            ps = parser.parse(line)
                        except TypeError:
                            raise
                        except Exception as e:
                            print("ERROR", e)

                            failure += 1
                            continue

                        if not ps:
                            failure += 1
                            continue

                        d = ps.dict
                        d['input'] = line.strip()
                        d['output'] = str(ps)

                        # Build the CSV row without the excluded keys; this
                        # tolerates a key being absent, unlike ``del``.
                        d2 = {k: v for k, v in d.items()
                              if k not in excluded_keys}
                        writer.writerow(d2)

                        # The parser strips 'BLOCK', and '/' is an intersection
                        if line.strip() != str(ps) and 'block' not in line.lower() and '/' not in line:
                            failure += 1
                            print('-----')
                            print(line.strip())
                            print(ps)

                            print()
                        else:

                            success += 1

            # Guard the rate against an empty fixture (total == 0).
            if total:
                rate = round((float(failure) / float(total) * 100), 3)
            else:
                rate = 0.0

            print()
            print("total={} success={} failure={} rate={}".format(total, success, failure, rate))