Example #1
import re

import pandas as pd
from bs4 import BeautifulSoup


def fetch_emails():
    html = cache_selenium(EMAIL_LINK_URL)
    soup = BeautifulSoup(html, 'html.parser')
    # The first link whose text looks like "county ... email" points at the spreadsheet.
    xlsx_url = soup('a', string=re.compile(r'county.*e-?mail', re.IGNORECASE))[0]['href']
    xlsx = cache_request(xlsx_url, is_binary=True)
    # Forward-fill merged county cells, then strip stray whitespace from every column.
    emails = pd.read_excel(xlsx).ffill().apply(lambda x: x.str.strip())
    emails = emails.rename(columns={'Email': 'emails'})
    emails['locale'] = emails['County'].str.title() + ' County'
    return emails.groupby('locale')['emails'].apply(list)
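The examples on this page lean on project-local helpers, cache_selenium and cache_request, whose definitions the snippets don't show. A minimal sketch of what they might look like, assuming a hash-keyed disk cache under .cache/ and headless Chrome (CACHE_DIR, the file layout, and these bodies are assumptions, not the project's actual code):

import hashlib
import time
from pathlib import Path

import requests
from selenium import webdriver

CACHE_DIR = Path('.cache')  # assumed cache location


def cache_selenium(url, wait=0, driver=None):
    """Fetch a URL with Selenium, caching the rendered HTML on disk (sketch)."""
    CACHE_DIR.mkdir(exist_ok=True)
    cache_file = CACHE_DIR / (hashlib.sha256(url.encode()).hexdigest() + '.html')
    if cache_file.exists():
        return cache_file.read_text()
    own_driver = driver is None
    if own_driver:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        if wait:
            time.sleep(wait)  # give JavaScript-rendered content time to settle
        html = driver.page_source
    finally:
        if own_driver:
            driver.quit()
    cache_file.write_text(html)
    return html


def cache_request(url, is_binary=False):
    """Fetch a URL with requests, caching the response body on disk (sketch)."""
    CACHE_DIR.mkdir(exist_ok=True)
    suffix = '.bin' if is_binary else '.txt'
    cache_file = CACHE_DIR / (hashlib.sha256(url.encode()).hexdigest() + suffix)
    if cache_file.exists():
        return cache_file.read_bytes() if is_binary else cache_file.read_text()
    response = requests.get(url)
    response.raise_for_status()
    if is_binary:
        cache_file.write_bytes(response.content)
        return response.content
    cache_file.write_text(response.text)
    return response.text

Keying the cache on the URL makes repeated scraping runs cheap to re-run while developing the parsers, since each page is fetched at most once.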
Example #2
from bs4 import BeautifulSoup
from tqdm import tqdm


def fetch_data(verbose=True):
    driver = init_selenium_driver()  # one driver, reused for every county page
    html = cache_selenium(BASE_URL, driver=driver)
    soup = BeautifulSoup(html, 'html.parser')
    county_links = soup.select('a[href^="countyInfo"]')
    assert county_links, (
        'No county links found in the following HTML:\n'
        + '#' * 30 + '\n' + html + '\n' + '#' * 30)
    data = [
        fetch_and_parse_county(BASE_URL + county_link['href'], driver)
        for county_link in tqdm(county_links, disable=not verbose)
    ]
    driver.quit()  # quit() ends the whole session; close() only closes the window
    return data
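init_selenium_driver is likewise not shown. Creating the driver once and passing it into cache_selenium avoids paying browser startup for each county page; a plausible sketch, assuming headless Chrome (the option set is an assumption):

from selenium import webdriver


def init_selenium_driver():
    """Create one headless Chrome driver to be shared across requests (sketch)."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')  # often needed when running in containers
    return webdriver.Chrome(options=options)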
Example #3
from bs4 import BeautifulSoup


def fetch_and_parse_county(county_url, driver):
    html = cache_selenium(county_url, wait=1, driver=driver)
    soup = BeautifulSoup(html, 'html.parser')
    county = soup.find('p', class_='title').text.split('Supervisor')[0].strip()
    # The first right-column link is a mailto:, the second is the county website.
    links = soup.find(id='rightContent')('a')
    return {
        'locale': county,
        # The official's name precedes the comma; '\xa0' is a non-breaking space.
        'official': soup.find('span', class_='bigRed').text
                    .replace('\xa0', ' ').split(',')[0].strip(),
        'emails': [links[0]['href'].replace('mailto:', '').strip()],
        'url': links[1]['href'].strip(),
        'county': county,
    }
Example #4
def fetch_data():
    html = cache_selenium(BASE_URL)
    data = parse_html(html)
    return data
Example #5
def main():
    html = cache_selenium(BASE_URL)
    data = parse_html(html)
    data = normalize_state(data)
    diff_and_save(data, 'public/massachusetts.json')


def fetch_data(verbose=True):  # pylint: disable=unused-argument
    html = cache_selenium(BASE_URL)
    data = parse_html(html)
    return data
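normalize_state and diff_and_save are project-local as well. As a rough sketch of the save step, assuming it writes pretty-printed JSON and touches the file only when the scraped data actually changed (the real helper may report a finer-grained diff):

import json
from pathlib import Path


def diff_and_save(data, path):
    """Write data as JSON, skipping the write when nothing changed (sketch)."""
    path = Path(path)
    new_text = json.dumps(data, indent=2, sort_keys=True)
    if path.exists() and path.read_text() == new_text:
        print(f'{path}: no changes')
        return
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(new_text)
    print(f'{path}: updated')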