def main():
    """Parse the WI municipal-clerks PDF into structured records.

    Reads the cached 'no emails' PDF page by page, splits each page's text
    into per-city line groups via chunk_city(), parses each group with
    parse_city_lines(), and dumps the resulting list to records.noemail.json.
    """
    pdf_path = (dir_path(__file__) +
                '/results/WI Municipal Clerks no emails Updated 3-23-2020.pdf')
    records = []
    with open(pdf_path, 'rb') as fh:
        pdf_reader = PyPDF2.PdfFileReader(fh)
        for page_num in tqdm(range(pdf_reader.numPages)):
            text = pdf_reader.getPage(page_num).extractText()
            for city_lines in chunk_city(text):
                # append() instead of `records += [x]` — same effect, clearer.
                records.append(parse_city_lines(city_lines))
    with open(dir_path(__file__) + '/results/records.noemail.json', 'w') as fh:
        json.dump(records, fh)
def main():
    """Write each parsed record's addresses out as per-jurisdiction JSON files.

    Loads records.noemail.json and, for every record that has a municipal
    and/or mailing address, saves that address under
    results/municipal_address/<key>.json / results/mailing_address/<key>.json.
    """
    with open(dir_path(__file__) + '/results/records.noemail.json') as fh:
        records = json.load(fh)
    # The original iterated `records[:None]`, which just copies the whole
    # list — iterate it directly.
    for record in tqdm(records):
        key = record['key']
        if record['municipal_address']:
            save_data(
                record['municipal_address'],
                dir_path(__file__) + f'/results/municipal_address/{key}.json',
            )
        if record['mailing_address']:
            save_data(
                record['mailing_address'],
                dir_path(__file__) + f'/results/mailing_address/{key}.json',
            )
def get_arcgis_fipscode():
    """Return the sorted FIPSCODE column from the Michigan ARCGIS export.

    The following file is the Michigan ARCGIS data and unlikely to change with time:
    https://gis-michigan.opendata.arcgis.com/datasets/minor-civil-divisions-cities-townships-v17a/data?geometry=-167.515%2C43.121%2C169.985%2C84.204
    """
    csv_path = os.path.join(dir_path(__file__), 'data',
                            'Minor_Civil_Divisions.csv')
    with open(csv_path) as csv_file:
        # DictReader removes the manual header/index bookkeeping and
        # streams rows instead of materializing the whole file first.
        return sorted(row['FIPSCODE'] for row in csv.DictReader(csv_file))
def get_arcgis_code():
    """Return the sorted DOA-code column from the Wisconsin ARCGIS export.

    The following file is the Wisconsin ARCGIS data and unlikely to change with time:
    https://data-ltsb.opendata.arcgis.com/datasets/wi-cities-towns-and-villages-july-2020
    """
    csv_path = os.path.join(dir_path(__file__), 'data',
                            'WI_Cities_Towns_and_Villages__July_2020.csv')
    with open(csv_path) as csv_file:
        # DictReader removes the manual header/index bookkeeping and
        # streams rows instead of materializing the whole file first.
        return sorted(row['DOA'] for row in csv.DictReader(csv_file))
def main():
    """Merge PDF-scraped clerk records with fetched address JSON and write public/wisconsin.json.

    Joins the no-email records (keyed by upper-cased jurisdiction name) with
    the cached municipal/mailing address payloads, prefers fetched fields over
    scraped ones where both exist, then normalizes locale/county strings and
    splits fax/email cells into lists.

    Removed dead code from the original: an unused `df_merged` inner join and
    two `.sample(..., random_state=42)` calls whose results were discarded.
    """
    df_noemail = pd.read_json(
        dir_path(__file__) + '/results/records.noemail.json')

    df_mail = pd.DataFrame(read_mailing_address()).dropna(axis=0, how='all')
    df_muni = pd.DataFrame(read_municipal_address()).dropna(axis=0, how='all')
    # Join key: jurisdiction name, upper-cased to match the PDF records.
    df_mail['key'] = df_mail['jurisdictionName'].str.upper()
    df_muni['key'] = df_muni['jurisdictionName'].str.upper()

    # Stack both fetched sources; identical rows collapse to one.
    df_fetched = pd.concat([
        df_muni.set_index('key'),
        df_mail.set_index('key'),
    ]).drop_duplicates()

    df_noemail2 = df_noemail.set_index('key')
    # Title-case the scraped text columns so they line up with fetched data.
    fix_cols = [
        'city_type', 'city', 'county', 'clerk', 'deputy_clerk',
        'municipal_address', 'mailing_address'
    ]
    for col in fix_cols:
        df_noemail2[col] = df_noemail2[col].str.title()

    df_master = df_noemail2.merge(df_fetched, on='key', how='left').reset_index()
    df_master['title_key'] = df_master['key'].str.title()

    def merge_all_cols(df, pairs):
        # For each (first, second) pair keep `first`, falling back to
        # `second` wherever `first` is null.
        return pd.DataFrame({
            first: df[first].where(df[first].notnull(), df[second])
            for first, second in pairs
        })

    df_final = pd.concat([
        merge_all_cols(df_master, [['muncipalAddress', 'municipal_address'],
                                   ['mailingAddress', 'mailing_address'],
                                   ['clerkName', 'clerk'],
                                   ['fax_y', 'fax_x'],
                                   ['jurisdictionName', 'title_key']]),
        df_master[['email', 'notificationEmail', 'county']],
    ], axis=1).rename(
        {
            'fax_y': 'fax',
            'city_type': 'cityType'
        }, axis=1)
    df_final['city'] = df_master['city_type'] + ' of ' + df_master['city']

    df_output = df_final[[
        'muncipalAddress', 'mailingAddress', 'clerkName', 'jurisdictionName',
        'county', 'city'
    ]].rename(
        {
            'muncipalAddress': 'physicalAddress',
            'mailingAddress': 'address',
            'clerkName': 'official',
            'jurisdictionName': 'locale',
        }, axis=1)

    def to_list(df):
        # Split ';'/',' separated cells into a de-duplicated list of
        # non-empty, stripped entries per row.
        return df.apply(lambda row: list(
            set(email.strip() for cell in row if pd.notnull(cell) if cell
                for email in re.split(';|,', cell) if email)),
            axis=1)

    df_output['county'] = df_output['county'].replace('Multiple Counties',
                                                      np.nan).str.strip()
    df_output['locale'] = df_output['locale'].str.replace(' - ', ':').str.replace(
        ' Of ', ' of ')
    df_output['locale'] = df_output['locale'].str.replace(
        'Multiple Counties', '').str.strip()
    df_output['city'] = df_output['city'].str.strip()
    df_output['faxes'] = to_list(df_final[['fax']])
    df_output['emails'] = to_list(df_final[['email', 'notificationEmail']])
    df_output.to_json('public/wisconsin.json', orient='records')
def read_mailing_address(add_lookup_key=False):
    """Load every cached mailing-address JSON under results/mailing_address/."""
    pattern = dir_path(__file__) + '/results/mailing_address/*.json'
    return _read_address(pattern, add_lookup_key)
import json
import glob
import os

from common import dir_path

# Build public/florida.json from the cached per-supervisor JSON files.
files = glob.glob(dir_path(__file__) + '/cache/*.json')
data = []
print(f'Found {len(files)} files')
for file in files:
    with open(file) as fh:
        datum = json.load(fh)
    # Title looks like "<County> Supervisor ..."; keep only the county part.
    county = datum['title'].split('Supervisor')[0].strip()
    # append() instead of `data += [{...}]` — same effect, clearer.
    data.append({
        'locale': county,
        # Drop non-breaking spaces and any trailing ", <suffix>" in the name.
        'official': datum['name'].replace(u'\xa0', ' ').split(',')[0].strip(),
        'emails': [datum['email'].split(':')[1].strip()],  # ignore leading 'mailto:'
        'url': datum['url'],
        'county': county,
    })
with open('public/florida.json', 'w') as fh:
    json.dump(data, fh)
from glob import glob
import json
import re

from bs4 import BeautifulSoup
from bs4.element import NavigableString

from common import dir_path

# Cached Georgia(?) county HTML pages scraped earlier — parsed below.
# NOTE(review): the ' Ga ' → ' GA ' fixup suggests Georgia; confirm with caller.
_dir = dir_path(__file__)
files = glob(f'{_dir}/cache/*.html')


def parse_addr_line(key, line):
    # Only plain text nodes (NavigableString) carry address text; tag
    # elements are skipped by returning an empty dict.
    if not isinstance(line, NavigableString):
        return {}
    else:
        # Title-case the line, but keep the state abbreviation upper-cased.
        return {key: line.strip().title().replace(' Ga ', ' GA ')}


def parse_contact_line(line):
    # Parse a "Key: Value" text node into a single-entry dict; anything
    # that is not a text node or not exactly one "k: v" pair yields {}.
    if not isinstance(line, NavigableString):
        return {}
    parsed = line.split(':')
    if len(parsed) == 2:
        k, v = parsed
        return {k.strip(): v.strip()}
    return {}


def parse_contact(h4):
from common import to_list
import json
from common import dir_path


def filter_dict_by_key(d, keys):
    # Keep only the entries of `d` whose key appears in `keys`.
    keys = set(keys)
    return {k: v for k, v in d.items() if k in keys}


# Build per-jurisdiction records from the cached JSON-lines scrape,
# keeping only 'local' (municipal) entries.
output = []
with open(dir_path(__file__) + '/cache/data.jl') as fh:
    for line in fh:
        data = json.loads(line)
        if data['type'] != 'local':
            continue
        value = filter_dict_by_key(data, {'clerk', 'email', 'phone', 'fax'})
        value['county'] = data['CountyName']
        value['city'] = data['jurisdictionName']
        # rename Twp to Township
        if value['city'].endswith('Twp'):
            value['city'] = value['city'][:-3] + 'Township'
        county = value['county'].title().strip()
from urllib.parse import urlparse, parse_qs
import re
import random
import unicodedata
import os

from tqdm import tqdm
from bs4 import BeautifulSoup

from common import dir_path, cache_request

BASE_URL = "https://mvic.sos.state.mi.us/Clerk"

# resolved issue with SSL cert chain by fixing intermediate cert
# base64 root-intermediate-site certs saved from Chrome, converted to pem using openssl,
# concatenated into mich_chain.pem
SSL_CERT = os.path.join(dir_path(__file__), 'michigan_chain.pem')

# Patterns for pulling fields out of the clerk "card-body" text blob:
# official name up to the first comma/newline, address between the first
# newline and "Phone", then the phone and fax values on their own lines.
re_official = re.compile(r'^\s*(.*?)\s*[,\n]')
re_address = re.compile(r'\n(.*)\nPhone', flags=re.MULTILINE + re.DOTALL)
re_phone = re.compile(r'\nPhone:[^\n\S]*(.+?)\s*\n')
re_fax = re.compile(r'Fax:[^\n\S]*(.+?)\s*\n')


def random_wait(min_wait=.1, max_wait=.3):
    # Random delay (seconds) used to space out requests to the MVIC site.
    return random.uniform(min_wait, max_wait)


def parse_jurisdiction(soup, jurisdiction_name, county_name, fipscode):
    # Normalize "... Twp" to "... Township" and title-case the county.
    city = re.sub(r'\s+Twp', ' Township', jurisdiction_name)
    county = county_name.title().strip()
    body = soup.find('div', class_='card-body')
    info = re.sub(r'\s*\n\s*', '\n',
    return {
        **init,
        'locale': locale,
        'official': official,
        'address': ', '.join(address),
        'emails': list(set(emails)),  # de-duplicate collected emails
        'phones': [phone],
        'faxes': [fax],
        'url': url,
    }


if __name__ == '__main__':
    # Actually this file: https://www.nvsos.gov/sos/elections/voters/county-clerk-contact-information
    # But it's behind a javascript test
    with open(dir_path(__file__) + '/cache/Nevada.htm') as fh:
        page = fh.read()
    soup = BeautifulSoup(page, 'lxml')
    ps = soup.select('div.content_area > p')
    # Flatten all paragraph children into one stream for line-based parsing.
    iter_ = ElemIterator([x for p in ps for x in p.children])
    raw_counties = [parse_county(county) for county in parse_lines(iter_)]
    # A county can appear more than once; merge duplicates by concatenating
    # their email lists under the first entry seen.
    merge_counties = {}
    for county in raw_counties:
        locale = county['locale']
        if locale in merge_counties:
            merge_counties[locale]['emails'] += county['emails']
        else:
            merge_counties[locale] = county
    counties = list(merge_counties.values())