Ejemplo n.º 1
0
    def test_geocode(self):
        """Parse intersection street names from the support CSV and look
        each one up in the clarinova geocoder partition, printing matches.

        Requires the ambry library and the clarinova geocoder bundle to be
        installed locally.
        """
        from ambry.geo.geocoder import Geocoder
        from address_parser import Parser
        import ambry
        import csv  # fixed: was imported twice in the original
        import os

        library = ambry.library()

        # Partition holding the geocoder lookup table.
        gp = library.get('clarinova.com-geocode-casnd-geocoder').partition

        f_intersections = os.path.join(os.path.dirname(__file__), 'support',
                                       'intersections.csv')

        # BUG FIX: the original query had `suffix = suffix`, which compares
        # the column to itself and is always true; bind the :suffix
        # parameter the same way as :name and :direction.
        q = """
        SELECT *
        FROM geocoder
        WHERE name = :name AND  direction = :direction AND suffix = :suffix
        """

        parser = Parser()

        with open(f_intersections) as f:
            for row in csv.DictReader(f):
                # Prepend a house number so the address parser accepts a
                # bare street name.
                ps = parser.parse('1000 ' + row['primary_rd'])
                print(ps.road.dict)

                for qr in gp.query(q, **ps.road.dict):
                    print("    ", qr)
Ejemplo n.º 2
0
    def test_address_files(self):
        """Parse every address line in the crime_addresses support file,
        echo parse results, and report success/failure counts.

        Reads  support/crime_addresses.txt and writes the CSV header to
        support/crime_addresses.out.csv.
        """
        import os
        from address_parser import Parser
        import csv

        parser = Parser()

        success = 0
        failure = 0
        total = 0
        filename = "crime_addresses"
        f_input = os.path.join(os.path.dirname(__file__), 'support',
                               filename + '.txt')
        f_output = os.path.join(os.path.dirname(__file__), 'support',
                                filename + '.out.csv')
        with open(f_output, 'w') as out:
            writer = csv.DictWriter(out, self.header)
            writer.writeheader()
            with open(f_input) as f:
                for line in f:
                    total += 1

                    print('----')
                    print(line.strip())

                    try:
                        ps = parser.parse(line)
                        if not ps:
                            failure += 1
                            continue
                    except Exception as e:
                        print("ERROR", e)
                        failure += 1
                        continue

                    # BUG FIX: the original had an unconditional `continue`
                    # here (a debugging leftover) that made everything below
                    # unreachable, so `success` stayed 0 and the final stats
                    # were meaningless.
                    print(ps)

                    d = ps.dict
                    d['input'] = line.strip()
                    d['output'] = str(ps)
                    # writer.writerow(d)  # disabled: d carries keys beyond self.header
                    print(d.keys())
                    if not ps.city:
                        failure += 1
                        print(d)
                        print(ps)
                        print()
                    else:
                        success += 1

            print()
            print("total={} success={} failure={} rate={}".format(
                total, success, failure,
                round((float(failure) / float(total) * 100), 3)))
Ejemplo n.º 3
0
    def test_hash(self):
        """Parse a sample address and pretty-print its parsed dict.

        NOTE(review): `a1` is never used — the hash comparison this test is
        named for appears to be unfinished; confirm intent.
        """
        from pprint import pprint
        # BUG FIX: Parser was not in scope in this snippet; import it here
        # the same way the sibling tests do.
        from address_parser import Parser

        a1 = '119 WEST WINTON AVENUE, HAYWARD, CA, 94544'
        a2 = '119 Winton Ave., Hayward, Ca, 94544-5000'

        parser = Parser()
        r = parser.parse(a2)

        pprint(r.dict)
Ejemplo n.º 4
0
    def __init__(self, partition, city_subs=None):
        """Set up the geocoder against a data partition.

        partition  -- the partition queried for geocode lookups.
        city_subs  -- optional mapping of city-name substitutions; keys are
                      normalized to lower case.
        """
        from address_parser import Parser

        self.p = partition

        # Cache of previously parsed/looked-up addresses.
        self.address_cache = {}

        # Normalize substitution keys to lower case so lookups are
        # case-insensitive.
        if city_subs:
            self.city_subs = {name.lower(): sub
                              for name, sub in city_subs.items()}
        else:
            self.city_subs = {}

        self.parser = Parser()
Ejemplo n.º 5
0
def chunked_geocode(addresses, state=None, chunk_size=250):
    """Geocode an iterable of (unique_id, address) pairs in batches.

    Parses each address, accumulates request rows, and submits them to the
    geocoding service in chunks of `chunk_size`, yielding
    (row_number, True, row_dict) for every response row. When `state` is
    given it overrides the state parsed from each address.

    NOTE(review): relies on `Parser`, `make_request` and `mkdict` defined
    elsewhere in this module.
    """
    parser = Parser()

    row_n = 0
    request_rows = []

    for uid, address_line in addresses:

        p = parser.parse(address_line)

        request_rows.append([
            uid,
            p.street_str(), p.locality.city, state or p.locality.state,
            p.locality.zip
        ])

        # BUG FIX: use >= so each batch holds exactly chunk_size rows; the
        # original `>` sent chunk_size + 1 rows per request.
        if len(request_rows) >= chunk_size:
            for row in make_request(request_rows):
                # row columns are:
                # unique_id input_address match quality match_address latlon
                # tiger_id side_of_street state_fips county_fips tract_fips
                # block_fips
                yield row_n, True, mkdict(row)
                row_n += 1

            request_rows = []

    # BUG FIX: only flush the remainder when there is something to send;
    # the original always issued a trailing (possibly empty) request.
    if request_rows:
        for row in make_request(request_rows):
            yield row_n, True, mkdict(row)
            row_n += 1
Ejemplo n.º 6
0
import pandas as pd
from scrapy.selector import Selector
from bs4 import BeautifulSoup
import re
import time
import requests
import xml.etree.ElementTree as etree
import pyap
from urllib.parse import urlparse
from selenium import webdriver
from shutil import which
from scrapy_selenium import SeleniumRequest
from address_parser import Parser
import requests  # NOTE(review): duplicate of the `requests` import above

# Shared address-parser instance used by the scraping helpers in this module.
address_parser = Parser()

#yield SeleniumRequest(url=url, callback=self.parse_result)
"""SELENIUM_DRIVER_NAME = 'chrome'
driver_path = which('/home/val/coding/chromedriver')
SELENIUM_DRIVER_ARGUMENTS=['-headless']"""

# Module-level headless Chrome driver shared by the scraping functions below.
# NOTE(review): `executable_path` is deprecated in Selenium 4 — confirm the
# installed Selenium version still accepts it. The hard-coded driver path is
# machine-specific.
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(executable_path='/home/val/coding/chromedriver',
                          options=options)


def fill_data_base(frame):
    # Automatic filling the data base with scraped information
Ejemplo n.º 7
0
from address_parser import Parser
import re


# Demo script: extract an e-mail and phone number from a contact string and
# parse a street address into its components.

addresss = '387 View Ave apt4 Twin Falls, ID 83301'

info = 'Email:  [email protected] Phone:  +1 (956) 8574114'


# BUG FIX: regex patterns are raw strings now; '\S', '\d' and '\(' in plain
# string literals are invalid escape sequences that warn (and will error) on
# modern Python. The matched text is unchanged.
email = re.findall(r'\S+@\S+', info)

# Matches US phone formats: 123-456-7890, (123) 456-7890, and bare 123-4567.
ok = re.findall(r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}', info)

parser = Parser()
adr = parser.parse(addresss)

print(f'{adr.number.number} {adr.road.direction} {adr.road.name} {adr.road.suffix}')
print(f'{adr.text}')
print(f'{email[0]}')
print(f'{ok[0]}')
    def get_pubmed_data(self,
                        query,
                        searched_zipcode,
                        date,
                        maximum_number_of_value=3):
        """Query PubMed and collect per-article author data into a CSV.

        For each article returned by `query`, accumulates the authors'
        names, institutes, affiliations, addresses and zipcodes as
        "||"-joined strings, then writes the collected rows to a CSV file
        named from `date` (format '%Y/%m/%d').

        NOTE(review): depends on self.has_match_zipcode_of_authprs,
        self.get_address_with_zipcode, self.get_organization and the
        self.is_us flag, plus module-level PubMed/json/pd/datetime and
        a `parser` from address_parser — none visible here; confirm against
        the full class.
        """
        csv_data = {
            "affiliation": [],
            "number_of_authors": [],
            "authors_name": [],
            "authors_institute": [],
            "authors_address": [],
            "authors_zipcode": [],
            "paper_title": [],
            "publication_date": [],
            "journal": []
        }
        pubmed = PubMed(tool="MyTool", email="*****@*****.**")
        parser = Parser()

        results = pubmed.query(query, max_results=maximum_number_of_value)
        # All-digit input means the caller searched by zipcode rather than text.
        is_queried_by_zipcode = searched_zipcode.isdecimal()

        if is_queried_by_zipcode:
            searched_zipcode = int(searched_zipcode)

        for article in results:
            jsonData = json.loads(article.toJSON())
            authors_list = jsonData['authors']
            authors_name = ""
            authors_institute = ""
            authors_affiliation = ""
            authors_address = ""
            authors_zipcode = ""
            num_authors = len(authors_list) or 0
            counted_matched = 0
            if is_queried_by_zipcode:
                # Count authors whose affiliation zipcode matches the search.
                counted_matched = self.has_match_zipcode_of_authprs(
                    authors_list, searched_zipcode)
            if (not is_queried_by_zipcode) or (is_queried_by_zipcode
                                               and counted_matched > 0):
                # Build "||"-separated aggregates across this article's authors.
                for index in range(0, num_authors):
                    affiliation = authors_list[index][
                        "affiliation"] or "<NOT_AVAILABLE>"
                    zipcode = str(self.get_address_with_zipcode(affiliation))
                    # print(type(zipcode))
                    # print(zipcode)
                    author_name = authors_list[index][
                        'firstname'] + " " + authors_list[index][
                            "lastname"] or "<NOT_AVAILABLE>"
                    author_institute = ""
                    author_institute += self.get_organization(
                        affiliation=affiliation) + " "
                    authors_affiliation += affiliation
                    authors_name += author_name
                    authors_institute += author_institute
                    authors_address += str(parser.parse(affiliation))
                    authors_zipcode += zipcode
                    # "||" separates author entries; omitted after the last one.
                    if num_authors != index + 1:
                        authors_name += "||"
                        authors_institute += "||"
                        authors_affiliation += "||"
                        authors_address += "||"
                        authors_zipcode += "||"
            else:
                # NOTE(review): this `break` abandons ALL remaining articles as
                # soon as one article has no zipcode match — a `continue` looks
                # more likely to be the intent; confirm.
                break
            paper_title = jsonData['title'] or "<NOT_AVAILABLE>"
            publication_date = jsonData['publication_date'] or "<NOT_AVAILABLE>"
            journal = jsonData['journal'] or "<NOT_AVAILABLE>"

            if self.is_us:
                if not is_queried_by_zipcode or (is_queried_by_zipcode
                                                 and counted_matched > 0):

                    csv_data["authors_name"].append(authors_name)
                    csv_data["affiliation"].append(authors_affiliation)
                    csv_data["authors_institute"].append(authors_institute)
                    csv_data["paper_title"].append(paper_title)
                    csv_data["publication_date"].append(publication_date)
                    csv_data["journal"].append(journal)
                    csv_data["authors_address"].append(authors_address)
                    csv_data["number_of_authors"].append(num_authors)
                    csv_data["authors_zipcode"].append(authors_zipcode)
                    # NOTE(review): self.is_us is cleared after the first
                    # appended row, so at most one row is ever written per
                    # call unless something re-sets it — confirm intent.
                    self.is_us = False

            # if not is_queried_by_zipcode or (is_queried_by_zipcode and counted_matched > 0):
            #
            #     df = pd.DataFrame(csv_data)
            #     # print(df.head())
            #     df.to_csv("PubMedData_from.csv", index=False)

        print("Size of csv ", len(csv_data["paper_title"]))
        if len(csv_data["paper_title"]) > 0:
            df = pd.DataFrame(csv_data)
            print(df.head())
            # File name carries the (reformatted) search date, e.g.
            # PubMedData_From_2020_01_31.csv
            datetimeobject = datetime.datetime.strptime(date, '%Y/%m/%d')
            csv_file_name = "PubMedData_From_" + datetimeobject.strftime(
                '%Y_%m_%d') + ".csv"
            print(csv_file_name)
            df.to_csv(csv_file_name, index=False)
Ejemplo n.º 9
0
    def test_address_files(self):
        """Round-trip every address in the support files through the parser,
        write the parsed rows to a CSV, and report a failure rate.

        A line counts as a failure when parsing raises, returns a falsy
        result, or does not round-trip to the original text (except for
        'BLOCK' addresses and '/' intersections, which the parser rewrites).
        """
        import os

        import csv

        parser = Parser()

        success = 0
        failure = 0
        total = 0

        for filename in ["crime_addresses"]:
            support_dir = os.path.join(os.path.dirname(__file__), 'support')
            f_input = os.path.join(support_dir, filename + '.txt')
            f_output = os.path.join(support_dir, filename + '.out.csv')

            with open(f_output, 'w') as out:
                writer = csv.DictWriter(out, self.header)
                writer.writeheader()
                with open(f_input) as f:
                    for line in f:
                        total += 1

                        try:
                            ps = parser.parse(line)
                            if not ps:
                                failure += 1
                                continue
                        except TypeError:
                            # Programming errors should surface, not be
                            # counted as parse failures.
                            raise
                        except Exception as e:
                            print("ERROR", e)

                            failure += 1
                            continue

                        record = ps.dict
                        record['input'] = line.strip()
                        record['output'] = str(ps)

                        # Drop the composite fields that are not CSV columns.
                        row = dict(record.items())
                        for key in ('hash', 'locality', 'text', 'road'):
                            del row[key]
                        writer.writerow(row)

                        # The parser strips 'BLOCK', and '/' is an intersection,
                        # so those lines are exempt from the round-trip check.
                        stripped = line.strip()
                        if (stripped != str(ps)
                                and 'block' not in line.lower()
                                and '/' not in line):
                            failure += 1
                            print('-----')
                            print(stripped)
                            print(ps)

                            print()
                        else:

                            success += 1

            print()
            print("total={} success={} failure={} rate={}".format(
                total, success, failure,
                round((float(failure) / float(total) * 100), 3)))