# Matches a 5-digit ZIP code plus any non-whitespace characters that
# immediately follow it (e.g. a "-1234" extension).
zip_re = re.compile(r"\d{5}[^\s]*")

# One address in the source page lacks the "<br />" separator between street
# and city that the parsing below relies on, so it is patched in by hand.
# If that exact text is no longer present, the workaround is obsolete and the
# script stops so that it gets removed.
if "300 S. Garnett St Henderson" in data:
    data = data.replace(
        "300 S. Garnett St Henderson",
        "300 S. Garnett St <br /> Henderson",
    )
else:
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("This is no longer a useful piece of code. Remove it.")
    sys.exit()

county_data = county_data_re.findall(data)

# Lists have no ``.length`` attribute (that raised AttributeError here); use
# len(). The two spaces reproduce what ``print "County number ", n`` emits.
print("County number  %d" % len(county_data))

for county in county_data:

    # Start every county from the values returned by dogcatcher.begin
    # (presumably blank defaults -- confirm against the dogcatcher module).
    (
        authority_name, first_name, last_name, county_name, town_name,
        fips, street, city, address_state, zip_code,
        po_street, po_city, po_state, po_zip_code,
        reg_authority_name, reg_first, reg_last,
        reg_street, reg_city, reg_state, reg_zip_code,
        reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
        reg_phone, reg_fax, reg_email, reg_website, reg_hours,
        phone, fax, email, website, hours, review,
    ) = dogcatcher.begin(voter_state)

    county_data_item = county_data_item_re.findall(county)

    county_name = county_name_re.findall(county)[0]

    # Drop embedded CRLFs and leading whitespace from the official's name
    # before splitting it into first and last name.
    official_name = name_re.findall(county)[0].replace("\r\n", "").lstrip()
    first_name, last_name, review = dogcatcher.split_name(
        official_name, review)

    # NC gives two addresses: a mailing address and a street address, each
    # formatted "Street <br /> City, State Zip". The mailing address may be
    # identical to the street address.
    # Each address is grabbed with a regex and split at the "<br />".
    # The mailing and street addresses are then compared; the handling for
    # the case where they differ presumably follows this excerpt.

    # Collapse all whitespace runs to single spaces, then split into
    # (before, "<br />", after) at the first "<br />".
    po_address = " ".join(
        county_data_item[1].replace("\r\n", "").split()
    ).partition("<br />")
# Patterns applied to the county pages. Names suggest their targets
# (address block, city/state/zip line, PO box line, website link, ...);
# their actual use is in code outside this excerpt.
address_re = re.compile("Board of Elections<BR>(.+?\d{5}[-\d]*?)<br>")
csz_re = re.compile("<br>([^<>]+?, [A-Z]{2,2} +?\d{5}[\d-]*)")
city_re = re.compile("(.+?),")
state_re = re.compile(" [A-Z][A-Z] ")
zip_re = re.compile("\d{5}[\d-]*")
po_re = re.compile("(P\.\s*O\..+?)<br>")
comma_re = re.compile("[, ]{2,}")
website_re = re.compile("HREF=\"([^m].+?)\">Visit")

# Boil the previously fetched page down to a plain list of county names.
# Each name is then turned into a URL, the matching county page is fetched,
# its data extracted and added to the results, before moving on to the next
# name (those later steps happen after this excerpt).
county_names = county_name_re.findall(data)

for item in county_names:

    # Start every county from the values returned by dogcatcher.begin
    # (presumably blank defaults -- confirm against the dogcatcher module).
    (
        authority_name, first_name, last_name, county_name, town_name,
        fips, street, city, address_state, zip_code,
        po_street, po_city, po_state, po_zip_code,
        reg_authority_name, reg_first, reg_last,
        reg_street, reg_city, reg_state, reg_zip_code,
        reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
        reg_phone, reg_fax, reg_email, reg_website, reg_hours,
        phone, fax, email, website, hours, review,
    ) = dogcatcher.begin(voter_state)

    authority_name = "Board of Elections"

    # The URL needs a slightly different spelling of the county name, and the
    # scraped name itself is truncated, for exactly two counties:
    # St. Lawrence and New York. Every other county uses the name as scraped.
    if item == "St":
        county_name, county_name_use = "St. Lawrence", "St.Lawrence"
    elif item == "New":
        county_name, county_name_use = "New York", "New+York"
    else:
        county_name = county_name_use = item