Ejemplo n.º 1
0
    def test_geocode(self):
        from ambry.geo.geocoder import Geocoder
        from address_parser import Parser
        import csv
        import ambry
        import os
        import csv

        l = ambry.library()

        gp = l.get('clarinova.com-geocode-casnd-geocoder').partition

        f_intersections = os.path.join(os.path.dirname(__file__), 'support', 'intersections.csv')

        q = """
        SELECT *
        FROM geocoder
        WHERE name = :name AND  direction = :direction AND suffix = suffix
        """

        p = Parser()

        with open(f_intersections) as f:
            reader = csv.DictReader(f)
            for r in reader:
                ps =  p.parse('1000 '+r['primary_rd'])
                print ps.road.dict

                for qr in gp.query(q,**ps.road.dict):
                    print "    ", qr
Ejemplo n.º 2
0
    def test_geocode(self):
        from ambry.geo.geocoder import Geocoder
        from address_parser import Parser
        import csv
        import ambry
        import os
        import csv

        l = ambry.library()

        gp = l.get('clarinova.com-geocode-casnd-geocoder').partition

        f_intersections = os.path.join(os.path.dirname(__file__), 'support',
                                       'intersections.csv')

        q = """
        SELECT *
        FROM geocoder
        WHERE name = :name AND  direction = :direction AND suffix = suffix
        """

        p = Parser()

        with open(f_intersections) as f:
            reader = csv.DictReader(f)
            for r in reader:
                ps = p.parse('1000 ' + r['primary_rd'])
                print ps.road.dict

                for qr in gp.query(q, **ps.road.dict):
                    print "    ", qr
Ejemplo n.º 3
0
    def test_address_files(self):
        import os
        from address_parser import Parser
        import csv

        parser = Parser()

        success = 0
        failure = 0
        total = 0
        filename = "crime_addresses"
        f_input = os.path.join(os.path.dirname(__file__), 'support',
                               filename + '.txt')
        f_output = os.path.join(os.path.dirname(__file__), 'support',
                                filename + '.out.csv')
        with open(f_output, 'w') as out:
            writer = csv.DictWriter(out, self.header)
            writer.writeheader()
            with open(f_input) as f:
                for line in f:

                    total += 1

                    print '----'
                    print line.strip()

                    try:
                        ps = parser.parse(line)
                        if not ps:
                            failure += 1
                            continue
                    except Exception as e:
                        print "ERROR", e
                        failure += 1
                        continue

                    print ps
                    continue

                    d = ps.dict
                    d['input'] = line.strip()
                    d['output'] = str(ps)
                    #writer.writerow(d)
                    print d.keys()
                    if not ps.city:
                        failure += 1
                        print d
                        print ps
                        print
                    else:

                        success += 1

            print
            print "total={} success={} failure={} rate={}".format(
                total, success, failure,
                round((float(failure) / float(total) * 100), 3))
Ejemplo n.º 4
0
    def test_hash(self):
        """Parse a sample address and pretty-print its parsed dictionary."""
        from pprint import pprint

        # Two spellings of the same address; only the second is parsed here.
        upper_form = '119 WEST WINTON AVENUE, HAYWARD, CA, 94544'
        mixed_form = '119 Winton Ave., Hayward, Ca, 94544-5000'
        a1, a2 = upper_form, mixed_form

        parsed = Parser().parse(a2)
        pprint(parsed.dict)
Ejemplo n.º 5
0
    def test_address_files(self):
        import os
        from address_parser import Parser
        import csv

        parser = Parser()

        success = 0
        failure = 0
        total = 0
        filename = "crime_addresses"
        f_input = os.path.join(os.path.dirname(__file__), 'support', filename + '.txt')
        f_output = os.path.join(os.path.dirname(__file__), 'support', filename + '.out.csv')
        with open(f_output, 'w') as out:
            writer = csv.DictWriter(out, self.header)
            writer.writeheader()
            with open(f_input) as f:
                for line in f:

                    total += 1

                    print '----'
                    print line.strip()

                    try:
                        ps = parser.parse(line)
                        if not ps:
                            failure += 1
                            continue
                    except Exception as e:
                        print "ERROR", e
                        failure += 1
                        continue

                    print ps
                    continue

                    d = ps.dict
                    d['input'] = line.strip()
                    d['output'] = str(ps)
                    # writer.writerow(d)
                    print d.keys()
                    if not ps.city:
                        failure += 1
                        print d
                        print ps
                        print
                    else:

                        success += 1

            print
            print "total={} success={} failure={} rate={}".format(total, success, failure,
                                                                  round((float(failure) / float(total) * 100), 3))
Ejemplo n.º 6
0
    def __init__(self, partition, city_subs=None):
        """Hold the geocoder partition, an empty result cache, and a
        city-substitution map whose keys are lower-cased for
        case-insensitive lookup."""
        from address_parser import Parser

        self.p = partition
        self.address_cache = {}

        if city_subs:
            self.city_subs = {name.lower(): sub
                              for name, sub in city_subs.items()}
        else:
            self.city_subs = {}

        self.parser = Parser()
Ejemplo n.º 7
0
        def address_gen():
            """Yield (address, row) pairs for crimeb rows.

            When a house number was parsed, it is randomized within its
            100-block; otherwise the original block address is used with
            'BLOCK' stripped.
            """
            from random import randint
            from address_parser import Parser

            parser = Parser()

            for row in self.partition(table='crimeb'):

                if not row.block_address:
                    continue

                block_addr = str(row.block_address).replace('EL CAM', 'El Camino')
                parsed = parser.parse(block_addr)

                if parsed and parsed.number.number and parsed.number.number > 0:
                    # Snap to the nearest 100, then add a random offset.
                    base = int(round(parsed.number.number, -2))
                    parsed.number.number = base + randint(0, 100)
                    street_num = str(parsed)
                else:
                    street_num = block_addr.replace('BLOCK', '')

                # Prefer the recorded city; fall back to the agency name,
                # except for the sheriff's department.
                city = row.city
                if not row.city and row.agency != 'SHERRIF':
                    city = row.agency
                if not city:
                    city = ''

                zipcode = ', {}'.format(row.zipcode) if row.zipcode else ''

                address = '{} {} CA{}'.format(street_num, city, zipcode)

                yield (address, row)
Ejemplo n.º 8
0
    def __init__(self, partition, city_subs=None):
        """Store the source partition, an empty geocode cache, a normalized
        (lower-cased key) city substitution map, and an address parser."""
        from address_parser import Parser

        self.p = partition
        self.address_cache = {}
        self.city_subs = dict(
            (key.lower(), value) for key, value in (city_subs or {}).items())
        self.parser = Parser()
Ejemplo n.º 9
0
def chunked_geocode(addresses, state=None, chunk_size=250):
    """Geocode an iterable of (unique_id, address_line) pairs in batches.

    Each address is parsed and submitted to make_request() in batches of
    chunk_size. Yields (row_number, True, mkdict(row)) per response row.
    `state`, when given, overrides the state parsed from each address.
    """
    parser = Parser()

    row_n = 0
    request_rows = []

    for uid, address_line in addresses:

        p = parser.parse(address_line)

        request_rows.append([
            uid,
            p.street_str(), p.locality.city, state or p.locality.state,
            p.locality.zip
        ])

        # BUG FIX: was 'len(request_rows) > chunk_size', which sent batches
        # of chunk_size + 1 rows.
        if len(request_rows) >= chunk_size:
            # Response columns: unique_id input_address match quality
            # match_address latlon tiger_id side_of_street state_fips
            # county_fips tract_fips block_fips
            for row in make_request(request_rows):
                yield row_n, True, mkdict(row)
                row_n += 1

            request_rows = []

    # Flush the final partial batch; BUG FIX: skip the request entirely when
    # there is nothing left, instead of calling make_request([]).
    if request_rows:
        for row in make_request(request_rows):
            yield row_n, True, mkdict(row)
            row_n += 1
Ejemplo n.º 10
0
    def build_masterlist(self, p):
        from address_parser import Parser
        from ambry.geo.geocoder import Geocoder
        
        gp = self.library.dep('geocoder').partition
        
        g = Geocoder(gp)
        
        ap = Parser()
        
        
        ip = self.library.dep('masterlist').partition
        lr = self.init_log_rate(1000)
     
     
        streets = set()
        
        with p.inserter() as ins:
            for row in ip.query("SELECT * FROM businesses WHERE address_id IS NULL"):
            
                row = dict(row)
            
                row['city'] = row['city'].strip().title() if row['city'] else ''
            
                if row['city'].strip().title() == 'La Jolla':
                    row['city'] = 'San Diego'

            
                ps = ap.parse(row['address'], row['city'], row['state'], row['zip'])
                
                try:
                    address_id, result, parsed = g.parse_and_code(str(ps))
                     
                except AttributeError as e:
                    print e
                    raise
                    continue
                    
                
                d = ps.args

                d['text'] = str(ps)
                d['orig_text'] = "{}, {}, {} {}".format(row['address'], row['city'], row['state'], row['zip'])
                d['source'] = 'sdbml'
                d['address_id'] = address_id

                k = (d['direction'], d['name'], d['suffix'])

                if not k in streets:
                    streets.add(k)
                    
                    d['for_testing'] = 'y'

                
                ins.insert(d)
                lr()
                
                #print ps
             
                
        return True
Ejemplo n.º 11
0
import pandas as pd
from scrapy.selector import Selector
from bs4 import BeautifulSoup
import re
import time
import requests
import xml.etree.ElementTree as etree
import pyap
from urllib.parse import urlparse
from selenium import webdriver
from shutil import which
from scrapy_selenium import SeleniumRequest
from address_parser import Parser
import requests

# Module-level address parser shared by the scraping helpers below.
address_parser = Parser()

#yield SeleniumRequest(url=url, callback=self.parse_result)
# NOTE(review): the triple-quoted block below is a dead scrapy-selenium
# configuration note -- it is a bare string expression, never executed
# as settings.
"""SELENIUM_DRIVER_NAME = 'chrome'
driver_path = which('/home/val/coding/chromedriver')
SELENIUM_DRIVER_ARGUMENTS=['-headless']"""

# Launch a headless Chrome through a hard-coded local chromedriver path.
# TODO(review): make the driver path configurable.
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(executable_path='/home/val/coding/chromedriver',
                          options=options)


def fill_data_base(frame):
    # Automatic filling the data base with scraped information
Ejemplo n.º 12
0
from address_parser import Parser
import re


# Demo: extract an email, a phone number and a parsed street address.
addresss = '387 View Ave apt4 Twin Falls, ID 83301'

info = 'Email:  [email protected] Phone:  +1 (956) 8574114'


# BUG FIX: regex patterns are now raw strings; '\S' and '\d' inside plain
# string literals are invalid escape sequences (DeprecationWarning today,
# a SyntaxError in future Python versions). Matching behavior is unchanged.
email = re.findall(r'\S+@\S+', info)

ok = re.findall(r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}', info)

parser = Parser()
adr = parser.parse(addresss)

print(f'{adr.number.number} {adr.road.direction} {adr.road.name} {adr.road.suffix}')
print(f'{adr.text}')
print(f'{email[0]}')
print(f'{ok[0]}')
Ejemplo n.º 13
0
print(query)
# Execute the query against the API
results = pubmed.query(query, max_results=3)

# PERF FIX: hoisted Parser() out of the article loop -- one instance is
# enough, and it was being rebuilt for every article.
parser = Parser()

# Loop over the retrieved articles
for article in results:

    # Each article can be either PubMedBookArticle or PubMedArticle.
    # print(type(article))

    # Parse the article's JSON representation into a plain dict.
    jsonData = json.loads(article.toJSON())

    # print(jsonData['authors'])
    authors_list = jsonData['authors']
    for index in range(0, len(authors_list)):
        print("affiliation ", authors_list[index]["affiliation"])
        # print("Author number ",index," ", authors_list[index]['firstname']," ",authors_list[index]["lastname"])
        # line = re.sub(r"[\w\W]* ((Hospital|University|Centre|Law School|School|Academy|Department)[\w -]*)[\w\W]*$",
        #               r"\1", authors_list[index]["affiliation"])
        # print("Institute name ",line)
        affiliation = authors_list[index]["affiliation"]
        # adr = parser.parse(affiliation)
        # print(adr.dict)
        # print(adr)
Ejemplo n.º 14
0
class Geocoder(object):
    """Geocode parsed addresses against a partition's 'geocoder' table.

    Results -- including misses -- are cached by the parsed address hash,
    so a repeated address never hits the database twice.
    """

    def __init__(self, partition, city_subs=None):
        """Keep the partition, an empty result cache, a case-insensitive
        city substitution map and a shared address parser."""
        from address_parser import Parser

        self.p = partition

        self.address_cache = {}

        # Keys are lower-cased so lookups are case-insensitive.
        self.city_subs = {
            k.lower(): v for k,
            v in city_subs.items()} if city_subs else {}

        self.parser = Parser()

    def parse_and_code(self, addrstr, city=None, state=None, zip=None):
        """Parse an address string, then geocode it.

        Returns (address_id, record, parsed_address); address_id and record
        are None when the geocoder finds no match.
        """

        adr = self.parser.parse(addrstr, city=city, state=state, zip=zip)

        if adr.hash in self.address_cache:
            r = self.address_cache[adr.hash]
            if r:
                address_id = r['address_id']
            else:
                address_id = None

        else:
            r = self.geocode(**adr.args)

            if r:
                address_id = r['address_id']
                self.address_cache[adr.hash] = r
            else:
                # Cache misses too, so failed lookups aren't repeated.
                self.address_cache[adr.hash] = None
                address_id = None

        return address_id, r, adr

    def geocode(self, number, name, direction=None,
                suffix=None, city=None, state=None, zip=None):
        """Return the best-scoring record from the geocoder table, or None.

        This function expects a partition, p, that holds a table named
        'geocoder', of the same structure as used in
        clarinova.com-geocode-casnd. City, zip and suffix matches score 10
        points each; at least two of the three must match, and the closest
        house number within +/-100 wins.
        """

        # Absent components are stored as '-' in the geocoder table.
        direction = direction.upper() if direction else '-'
        suffix = suffix.title() if suffix else '-'
        city = city.title() if city else '-'

        if city.lower() in self.city_subs:
            city = self.city_subs[city.lower()].title()

        # Strip a ZIP+4 suffix, keeping only the 5-digit base.
        if isinstance(zip, basestring) and '-' in zip:
            zip, zsuffix = zip.split('-')

        zip = zip if zip else -1

        try:
            zip = int(zip)
        # BUG FIX: was a bare 'except:', which also swallowed
        # KeyboardInterrupt/SystemExit; only conversion failures are expected.
        except (TypeError, ValueError):
            zip = -1

        suffix = suffix.lower()

        # We don't need to check for nulls in direction, b/c entries without
        # directions have the value '-'
        q = """
        SELECT
            *,
            (
                CASE WHEN city = :city THEN 10 ELSE 0 END +
                CASE WHEN zip = :zip THEN 10 ELSE 0 END +
                CASE WHEN suffix = :suffix THEN 10 ELSE 0 END
            ) AS score,
            ABS(number - :number) as ndist

        FROM geocoder
        WHERE  name = :name AND direction = :direction
        AND score >= 20
        AND number BETWEEN (:number-100) AND (:number+100)
        ORDER BY ABS(number - :number), score LIMIT 1;
        """

        r = self.p.query(
            q,
            number=number,
            name=name,
            direction=direction,
            suffix=suffix,
            city=city,
            state=state,
            zip=zip).first()

        if not r:
            return None

        r = dict(r)
        # Confidence falls with missing score points and number distance.
        r['confidence'] = round(
            (100.0 - (30.0 - r['score']) - (r['ndist'] / 2.0)) / 100.0, 3)
        # Stored as fixed-point integers; convert to decimal degrees.
        r['lat'] = float(r['lat']) / 100000000.0
        r['lon'] = float(r['lon']) / 100000000.0
        return r

    def geocode_intersection(self, street1, street2):
        """Geocode the intersection of two streets. Not implemented."""
        pass
Ejemplo n.º 15
0
class Geocoder(object):
    """Look up parsed addresses in a partition's 'geocoder' table, caching
    every result (hit or miss) by the parsed address hash."""

    def __init__(self, partition, city_subs=None):
        """Store the partition, an empty cache, a lower-cased city
        substitution map, and an address parser."""
        from address_parser import Parser

        self.p = partition

        self.address_cache = {}

        # Lower-cased keys make the substitution lookup case-insensitive.
        self.city_subs = {
            k.lower(): v for k,
            v in city_subs.items()} if city_subs else {}

        self.parser = Parser()

    def parse_and_code(self, addrstr, city=None, state=None, zip=None):
        """Parse addrstr and geocode it, consulting the cache first.

        Returns (address_id, record, parsed_address); the first two are
        None when nothing matched.
        """

        adr = self.parser.parse(addrstr, city=city, state=state, zip=zip)

        if adr.hash in self.address_cache:
            r = self.address_cache[adr.hash]
            if r:
                address_id = r['address_id']
            else:
                address_id = None

        else:
            r = self.geocode(**adr.args)

            if r:
                address_id = r['address_id']
                self.address_cache[adr.hash] = r
            else:
                # Negative results are cached as well.
                self.address_cache[adr.hash] = None
                address_id = None

        return address_id, r, adr

    def geocode(self, number, name, direction=None,
                suffix=None, city=None, state=None, zip=None):
        """Return the best-matching record from the geocoder table, or None.

        This function expects a partition, p, that holds a table named
        'geocoder', of the same structure as used in
        clarinova.com-geocode-casnd. City, zip and suffix each contribute 10
        score points; at least 20 points are required, and the nearest house
        number within +/-100 is preferred.
        """

        # Missing components are represented as '-' in the table.
        direction = direction.upper() if direction else '-'
        suffix = suffix.title() if suffix else '-'
        city = city.title() if city else '-'

        if city.lower() in self.city_subs:
            city = self.city_subs[city.lower()].title()

        # Keep only the 5-digit base of a ZIP+4 code.
        if isinstance(zip, basestring) and '-' in zip:
            zip, zsuffix = zip.split('-')

        zip = zip if zip else -1

        try:
            zip = int(zip)
        # BUG FIX: replaced a bare 'except:' (which also caught
        # KeyboardInterrupt/SystemExit) with the conversion errors int()
        # actually raises.
        except (TypeError, ValueError):
            zip = -1

        suffix = suffix.lower()

        # We don't need to check for nulls in direction, b/c entries without
        # directions have the value '-'
        q = """
        SELECT
            *,
            (
                CASE WHEN city = :city THEN 10 ELSE 0 END +
                CASE WHEN zip = :zip THEN 10 ELSE 0 END +
                CASE WHEN suffix = :suffix THEN 10 ELSE 0 END
            ) AS score,
            ABS(number - :number) as ndist

        FROM geocoder
        WHERE  name = :name AND direction = :direction
        AND score >= 20
        AND number BETWEEN (:number-100) AND (:number+100)
        ORDER BY ABS(number - :number), score LIMIT 1;
        """

        r = self.p.query(
            q,
            number=number,
            name=name,
            direction=direction,
            suffix=suffix,
            city=city,
            state=state,
            zip=zip).first()

        if not r:
            return None

        r = dict(r)
        # Penalize missing score points and house-number distance.
        r['confidence'] = round(
            (100.0 - (30.0 - r['score']) - (r['ndist'] / 2.0)) / 100.0, 3)
        # lat/lon are stored as fixed-point integers.
        r['lat'] = float(r['lat']) / 100000000.0
        r['lon'] = float(r['lon']) / 100000000.0
        return r

    def geocode_intersection(self, street1, street2):
        """Geocode the intersection of two streets. Not implemented."""
        pass
Ejemplo n.º 16
0
from ambry_sources.mpf import MPRowsFile
from address_parser import Parser
import cPickle as pickle

from address_parser import Parser
# Shared address parser for building street keys below.
parser = Parser()

# Load previously pickled accumulators from the working directory.
# NOTE(review): pickle is only safe on trusted, locally produced files.
with open('gps_dump.pkl', 'rb') as f:
    gps_acc = pickle.load(f)

with open('tickets.pkl', 'rb') as f:
    tickets_acc = pickle.load(f)

from collections import defaultdict

# Per-key pair of counters -- presumably [swept, ticketed]; verify against
# the marking loop below.
d = defaultdict(lambda : [0,0])

def mkstreet(number, name, suffix):
    """Join a block number, street name and suffix into one street key."""
    parts = (number, name, suffix)
    return " ".join(str(part) for part in parts)


# Create a dict of date/street pairs, then mark them for if the pair
# was swept, then if the pair was ticketed
for base_street, dates in gps_acc.items():
    ps = parser.parse(base_street)
    
    # Expand each street block to the 100 block before and after, to deal 
    # with possible missing GPS reverse-geocodes
    streets = [mkstreet(ps.number.number, ps.road.name, ps.road.suffix),
              mkstreet(ps.number.number+100, ps.road.name, ps.road.suffix),
    def get_pubmed_data(self,
                        query,
                        searched_zipcode,
                        date,
                        maximum_number_of_value=3):
        """Query PubMed and write per-article author/affiliation data to a
        CSV named after *date*.

        When *searched_zipcode* is numeric, only articles with at least one
        author matching that zipcode are kept, and iteration stops at the
        first non-matching article (the 'break' below).

        NOTE(review): relies on self.is_us, self.has_match_zipcode_of_authprs,
        self.get_address_with_zipcode and self.get_organization, which are
        defined elsewhere; their semantics are assumed from their names.
        """
        # One list per CSV column; each article appends one value per column.
        csv_data = {
            "affiliation": [],
            "number_of_authors": [],
            "authors_name": [],
            "authors_institute": [],
            "authors_address": [],
            "authors_zipcode": [],
            "paper_title": [],
            "publication_date": [],
            "journal": []
        }
        pubmed = PubMed(tool="MyTool", email="*****@*****.**")
        parser = Parser()

        results = pubmed.query(query, max_results=maximum_number_of_value)
        is_queried_by_zipcode = searched_zipcode.isdecimal()

        if is_queried_by_zipcode:
            searched_zipcode = int(searched_zipcode)

        for article in results:
            jsonData = json.loads(article.toJSON())
            authors_list = jsonData['authors']
            # Per-article accumulators; multiple authors are '||'-joined
            # into a single CSV cell.
            authors_name = ""
            authors_institute = ""
            authors_affiliation = ""
            authors_address = ""
            authors_zipcode = ""
            # NOTE(review): len() never returns a falsy non-zero value, so
            # the 'or 0' here is redundant.
            num_authors = len(authors_list) or 0
            counted_matched = 0
            if is_queried_by_zipcode:
                counted_matched = self.has_match_zipcode_of_authprs(
                    authors_list, searched_zipcode)
            if (not is_queried_by_zipcode) or (is_queried_by_zipcode
                                               and counted_matched > 0):
                for index in range(0, num_authors):
                    affiliation = authors_list[index][
                        "affiliation"] or "<NOT_AVAILABLE>"
                    zipcode = str(self.get_address_with_zipcode(affiliation))
                    # print(type(zipcode))
                    # print(zipcode)
                    # NOTE(review): 'or' applies to the whole concatenation,
                    # which always contains the separating space and so is
                    # never falsy -- "<NOT_AVAILABLE>" can never be chosen.
                    author_name = authors_list[index][
                        'firstname'] + " " + authors_list[index][
                            "lastname"] or "<NOT_AVAILABLE>"
                    author_institute = ""
                    author_institute += self.get_organization(
                        affiliation=affiliation) + " "
                    authors_affiliation += affiliation
                    authors_name += author_name
                    authors_institute += author_institute
                    authors_address += str(parser.parse(affiliation))
                    authors_zipcode += zipcode
                    # Separate authors within one cell, except after the last.
                    if num_authors != index + 1:
                        authors_name += "||"
                        authors_institute += "||"
                        authors_affiliation += "||"
                        authors_address += "||"
                        authors_zipcode += "||"
            else:
                # No zipcode match: stop processing further articles entirely.
                break
            paper_title = jsonData['title'] or "<NOT_AVAILABLE>"
            publication_date = jsonData['publication_date'] or "<NOT_AVAILABLE>"
            journal = jsonData['journal'] or "<NOT_AVAILABLE>"

            # NOTE(review): self.is_us is set to False right after the first
            # append, so at most one article is recorded per call -- confirm
            # this is intended.
            if self.is_us:
                if not is_queried_by_zipcode or (is_queried_by_zipcode
                                                 and counted_matched > 0):

                    csv_data["authors_name"].append(authors_name)
                    csv_data["affiliation"].append(authors_affiliation)
                    csv_data["authors_institute"].append(authors_institute)
                    csv_data["paper_title"].append(paper_title)
                    csv_data["publication_date"].append(publication_date)
                    csv_data["journal"].append(journal)
                    csv_data["authors_address"].append(authors_address)
                    csv_data["number_of_authors"].append(num_authors)
                    csv_data["authors_zipcode"].append(authors_zipcode)
                    self.is_us = False

            # if not is_queried_by_zipcode or (is_queried_by_zipcode and counted_matched > 0):
            #
            #     df = pd.DataFrame(csv_data)
            #     # print(df.head())
            #     df.to_csv("PubMedData_from.csv", index=False)

        print("Size of csv ", len(csv_data["paper_title"]))
        if len(csv_data["paper_title"]) > 0:
            df = pd.DataFrame(csv_data)
            print(df.head())
            # File name carries the query date, e.g. PubMedData_From_2020_01_31.csv
            datetimeobject = datetime.datetime.strptime(date, '%Y/%m/%d')
            csv_file_name = "PubMedData_From_" + datetimeobject.strftime(
                '%Y_%m_%d') + ".csv"
            print(csv_file_name)
            df.to_csv(csv_file_name, index=False)
Ejemplo n.º 18
0
from ambry_sources.mpf import MPRowsFile
from address_parser import Parser
import cPickle as pickle

import time

# NOTE(review): hard-coded absolute path to a local build -- make configurable.
f = MPRowsFile('/Users/eric/proj/virt/ambry10/library/build/nbcuni.com/streetsweep/nbcuni.com/streetsweep-0.0.1/tickets.mpr')

parser = Parser()

# Timing and accumulator setup for the row scan that follows.
start = time.time()
s = 0 
from collections import defaultdict
# Maps a "number name suffix" street key to the set of issue dates seen.
acc = defaultdict(set)
with f.reader as r:
    for i, row in enumerate(r, 1):

        adr = row.locationdesc1
        if adr:
            ps = parser.parse(adr)
            dt = row.issuedate
            if ps.number.number > 0 and dt:
                number = int(ps.number.number / 100) * 100
                
                key = "{} {} {}".format(number, ps.road.name, ps.road.suffix)
                print row
                acc[key].add(dt)
        
        if i % 10000 == 0:
          
Ejemplo n.º 19
0
    def test_address_files(self):
        """Round-trip each fixture address through the parser, write the
        parsed fields to a CSV, and count lines that don't reproduce
        exactly (BLOCK addresses and '/' intersections are exempt)."""
        import os
        import csv

        # CONSISTENCY FIX: import Parser locally like the sibling tests do,
        # instead of relying on a module-level import.
        from address_parser import Parser

        parser = Parser()

        success = 0
        failure = 0
        total = 0

        for filename in ["crime_addresses"]:
            f_input = os.path.join(os.path.dirname(__file__), 'support', filename + '.txt')
            f_output = os.path.join(os.path.dirname(__file__), 'support', filename + '.out.csv')
            with open(f_output, 'w') as out:
                writer = csv.DictWriter(out, self.header)
                writer.writeheader()
                with open(f_input) as f:
                    for line in f:

                        total += 1

                        try:
                            ps = parser.parse(line)
                            if not ps:
                                failure += 1
                                continue
                        except TypeError:
                            # A TypeError signals a test bug, not a parse
                            # failure: let it propagate.
                            raise
                        except Exception as e:
                            print("ERROR", e)

                            failure += 1
                            continue

                        d = ps.dict
                        d['input'] = line.strip()
                        d['output'] = str(ps)

                        # Drop composite fields that aren't CSV columns.
                        d2 = dict(d)
                        del d2['hash']
                        del d2['locality']
                        del d2['text']
                        del d2['road']
                        writer.writerow(d2)

                        # The parser strips 'BLOCK', and '/' is an intersection
                        if line.strip() != str(ps) and 'block' not in line.lower() and '/' not in line:
                            failure += 1
                            print('-----')
                            print(line.strip())
                            print(ps)

                            print()
                        else:

                            success += 1

            print()
            print("total={} success={} failure={} rate={}".format(
                total, success, failure,
                round((float(failure) / float(total) * 100), 3)))