Example 1
0
def main():
    """Extract one record per city from the WI municipal-clerks PDF and
    dump the list to results/records.noemail.json.

    Relies on project helpers: dir_path, chunk_city, parse_city_lines.
    """
    pdf_path = (
        dir_path(__file__) +
        '/results/WI Municipal Clerks no emails Updated 3-23-2020.pdf')

    records = []
    with open(pdf_path, 'rb') as fh:
        pdf_reader = PyPDF2.PdfFileReader(fh)
        for page_num in tqdm(range(pdf_reader.numPages)):
            text = pdf_reader.getPage(page_num).extractText()
            # chunk_city splits a page's raw text into per-city line groups;
            # each group parses into one record dict.
            records.extend(
                parse_city_lines(city_lines) for city_lines in chunk_city(text))

    with open(dir_path(__file__) + '/results/records.noemail.json', 'w') as fh:
        json.dump(records, fh)
Example 2
0
def main():
    """For every record in records.noemail.json, fetch/save the municipal
    and mailing address detail files (one JSON per record key).

    Relies on project helpers: dir_path, save_data.
    """
    with open(dir_path(__file__) + '/results/records.noemail.json') as fh:
        records = json.load(fh)

    # Original iterated `records[:None]`, which is just a full copy — a
    # leftover debugging slice. Iterate the list directly.
    for record in tqdm(records):
        key = record['key']
        if record['municipal_address']:
            save_data(
                record['municipal_address'],
                dir_path(__file__) + f'/results/municipal_address/{key}.json',
            )

        if record['mailing_address']:
            save_data(
                record['mailing_address'],
                dir_path(__file__) + f'/results/mailing_address/{key}.json',
            )
Example 3
0
def get_arcgis_fipscode():
    """Return the sorted list of FIPSCODE values from the cached Michigan
    ARCGIS CSV.

    The following file is the Michigan ARCGIS data and unlikely to change with time
    https://gis-michigan.opendata.arcgis.com/datasets/minor-civil-divisions-cities-townships-v17a/data?geometry=-167.515%2C43.121%2C169.985%2C84.204
    """
    with open(
            os.path.join(dir_path(__file__), 'data',
                         'Minor_Civil_Divisions.csv')) as csv_file:
        # DictReader consumes the header row and keys each row by column
        # name, replacing the manual header.index(...) bookkeeping.
        return sorted(row['FIPSCODE'] for row in csv.DictReader(csv_file))
Example 4
0
def get_arcgis_code():
    """Return the sorted list of DOA codes from the cached Wisconsin
    ARCGIS CSV.

    The following file is the Wisconsin ARCGIS data and unlikely to change with time
    https://data-ltsb.opendata.arcgis.com/datasets/wi-cities-towns-and-villages-july-2020
    """
    with open(
            os.path.join(
                dir_path(__file__), 'data',
                'WI_Cities_Towns_and_Villages__July_2020.csv')) as csv_file:
        # DictReader consumes the header row and keys each row by column
        # name, replacing the manual header.index(...) bookkeeping.
        return sorted(row['DOA'] for row in csv.DictReader(csv_file))
Example 5
0
def main():
    """Merge the PDF-derived records (records.noemail.json) with the
    per-city address JSON fetched separately, normalize columns, and
    write the final dataset to public/wisconsin.json.

    Relies on project helpers: dir_path, read_mailing_address,
    read_municipal_address.
    """
    df_noemail = pd.read_json(
        dir_path(__file__) + '/results/records.noemail.json')

    # Fetched address records; drop rows that are entirely empty.
    df_mail = pd.DataFrame(read_mailing_address()).dropna(axis=0, how='all')
    df_muni = pd.DataFrame(read_municipal_address()).dropna(axis=0, how='all')

    # Upper-cased jurisdiction name is the join key used throughout.
    df_mail['key'] = df_mail['jurisdictionName'].str.upper()
    df_muni['key'] = df_muni['jurisdictionName'].str.upper()

    # Stack both fetched sources, collapsing duplicate rows.
    # (An unused inner merge and two discarded .sample() calls — leftover
    # notebook artifacts — were removed here.)
    df_fetched = pd.concat([
        df_muni.set_index('key'),
        df_mail.set_index('key'),
    ]).drop_duplicates()

    df_noemail2 = df_noemail.set_index('key')
    # Title-case the free-text columns scraped from the PDF.
    fix_cols = [
        'city_type', 'city', 'county', 'clerk', 'deputy_clerk',
        'municipal_address', 'mailing_address'
    ]
    for col in fix_cols:
        df_noemail2[col] = df_noemail2[col].str.title()

    # Left-join so every PDF record survives even without fetched data.
    df_master = df_noemail2.merge(df_fetched, on='key',
                                  how='left').reset_index()
    df_master['title_key'] = df_master['key'].str.title()

    def merge_all_cols(df, pairs):
        # Coalesce: keep `first`, filling its nulls from `second`.
        return pd.DataFrame({
            first: df[first].where(df[first].notnull(), df[second])
            for first, second in pairs
        })

    # NOTE: 'muncipalAddress' (sic) matches the upstream field spelling.
    df_final = pd.concat([
        merge_all_cols(df_master, [['muncipalAddress', 'municipal_address'],
                                   ['mailingAddress', 'mailing_address'],
                                   ['clerkName', 'clerk'], ['fax_y', 'fax_x'],
                                   ['jurisdictionName', 'title_key']]),
        df_master[['email', 'notificationEmail', 'county']],
    ],
                         axis=1).rename(
                             {
                                 'fax_y': 'fax',
                                 'city_type': 'cityType'
                             }, axis=1)

    df_final['city'] = df_master['city_type'] + ' of ' + df_master['city']

    df_output = df_final[[
        'muncipalAddress', 'mailingAddress', 'clerkName', 'jurisdictionName',
        'county', 'city'
    ]].rename(
        {
            'muncipalAddress': 'physicalAddress',
            'mailingAddress': 'address',
            'clerkName': 'official',
            'jurisdictionName': 'locale',
        },
        axis=1)

    def to_list(df):
        # Split each non-null, non-empty cell on ';' or ',', strip each
        # piece, and dedupe per row.
        return df.apply(lambda row: list(
            set(email.strip() for cell in row if pd.notnull(cell) if cell
                for email in re.split(';|,', cell) if email)),
                        axis=1)

    df_output['county'] = df_output['county'].replace('Multiple Counties',
                                                      np.nan).str.strip()
    df_output['locale'] = df_output['locale'].str.replace(' - ',
                                                          ':').str.replace(
                                                              ' Of ', ' of ')
    df_output['locale'] = df_output['locale'].str.replace(
        'Multiple Counties', '').str.strip()
    df_output['city'] = df_output['city'].str.strip()

    df_output['faxes'] = to_list(df_final[['fax']])
    df_output['emails'] = to_list(df_final[['email', 'notificationEmail']])

    df_output.to_json('public/wisconsin.json', orient='records')
Example 6
0
def read_mailing_address(add_lookup_key=False):
    """Load all cached mailing-address JSON files via _read_address."""
    pattern = dir_path(__file__) + '/results/mailing_address/*.json'
    return _read_address(pattern, add_lookup_key)
Example 7
0
import json
import glob
import os

from common import dir_path

# Read every cached supervisor-of-elections JSON page and emit the
# consolidated public/florida.json dataset.
files = glob.glob(dir_path(__file__) + '/cache/*.json')
print(f'Found {len(files)} files')

data = []
for path in files:
    with open(path) as fh:
        datum = json.load(fh)
        county = datum['title'].split('Supervisor')[0].strip()
        official = datum['name'].replace(u'\xa0', ' ').split(',')[0].strip()
        # ignore leading 'mailto:'
        email = datum['email'].split(':')[1].strip()
        data.append({
            'locale': county,
            'official': official,
            'emails': [email],
            'url': datum['url'],
            'county': county,
        })

with open('public/florida.json', 'w') as fh:
    json.dump(data, fh)
Example 8
0
from glob import glob
import json
import re
from bs4 import BeautifulSoup
from bs4.element import NavigableString

from common import dir_path

# Directory of this script; cached pages are resolved relative to it.
_dir = dir_path(__file__)

# Previously downloaded HTML pages to parse (one per cached page).
files = glob(f'{_dir}/cache/*.html')


def parse_addr_line(key, line):
    """Map a plain text node to {key: cleaned text}; non-text nodes yield {}.

    Cleaning title-cases the line and restores the ' GA ' state
    abbreviation that title-casing would mangle.
    """
    if isinstance(line, NavigableString):
        cleaned = line.strip().title().replace(' Ga ', ' GA ')
        return {key: cleaned}
    return {}


def parse_contact_line(line):
    """Parse a 'Key: value' text node into {'Key': 'value'}.

    Returns {} for non-text nodes or lines without exactly one colon, so
    callers can merge results unconditionally.
    """
    if not isinstance(line, NavigableString):
        return {}
    pieces = line.split(':')
    if len(pieces) != 2:
        return {}
    name, value = pieces
    return {name.strip(): value.strip()}


def parse_contact(h4):
Example 9
0
from common import to_list
import json

from common import dir_path


def filter_dict_by_key(d, keys):
    """Return a new dict with only the entries of *d* whose key is in *keys*."""
    wanted = set(keys)  # O(1) membership tests
    return {key: value for key, value in d.items() if key in wanted}


output = []

with open(dir_path(__file__) + '/cache/data.jl') as fh:
    for line in fh:
        data = json.loads(line)

        if data['type'] != 'local':
            continue

        value = filter_dict_by_key(data, {'clerk', 'email', 'phone', 'fax'})

        value['county'] = data['CountyName']
        value['city'] = data['jurisdictionName']

        # rename Twp to Township
        if value['city'].endswith('Twp'):
            value['city'] = value['city'][:-3] + 'Township'

        county = value['county'].title().strip()
Example 10
0
from urllib.parse import urlparse, parse_qs
import re
import random
import unicodedata
import os
from tqdm import tqdm
from bs4 import BeautifulSoup
from common import dir_path, cache_request

# Michigan Voter Information Center clerk-lookup endpoint.
BASE_URL = "https://mvic.sos.state.mi.us/Clerk"

# resolved issue with SSL cert chain by fixing intermediate cert
# base64 root-intermediate-site certs saved from Chrome, converted to pem using openssl,
# concatenated into mich_chain.pem
SSL_CERT = os.path.join(dir_path(__file__), 'michigan_chain.pem')

# Patterns for pulling fields out of the clerk "card" text blob:
re_official = re.compile(r'^\s*(.*?)\s*[,\n]')  # name: everything before the first comma/newline
re_address = re.compile(r'\n(.*)\nPhone', flags=re.MULTILINE + re.DOTALL)  # lines between name and "Phone"
re_phone = re.compile(r'\nPhone:[^\n\S]*(.+?)\s*\n')  # [^\n\S] = horizontal whitespace only
re_fax = re.compile(r'Fax:[^\n\S]*(.+?)\s*\n')


def random_wait(min_wait=.1, max_wait=.3):
    """Return a random delay in seconds drawn uniformly from [min_wait, max_wait]."""
    span = max_wait - min_wait
    return min_wait + span * random.random()


def parse_jurisdiction(soup, jurisdiction_name, county_name, fipscode):
    city = re.sub(r'\s+Twp', ' Township', jurisdiction_name)
    county = county_name.title().strip()
    body = soup.find('div', class_='card-body')
    info = re.sub(r'\s*\n\s*', '\n',
Example 11
0
  return {
    **init,
    'locale': locale,
    'official': official,
    'address': ', '.join(address),
    'emails': list(set(emails)),
    'phones': [phone],
    'faxes': [fax],
    'url': url,
  }

if __name__ == '__main__':
  # Actually this file: https://www.nvsos.gov/sos/elections/voters/county-clerk-contact-information
  # But it's behind a javascript test
  with open(dir_path(__file__) + '/cache/Nevada.htm') as fh:
    page = fh.read()
  soup = BeautifulSoup(page, 'lxml')
  ps = soup.select('div.content_area > p')
  iter_ = ElemIterator([x for p in ps for x in p.children])
  raw_counties = [parse_county(county) for county in parse_lines(iter_)]

  merge_counties = {}
  for county in raw_counties:
    locale = county['locale']
    if locale in merge_counties:
      merge_counties[locale]['emails'] += county['emails']
    else:
      merge_counties[locale] = county

  counties = list(merge_counties.values())