Ejemplo n.º 1

# Download the second source PDF, sending the shared request headers.
req_2 = urllib2.Request(url_2, headers=headers)
pdf_2 = urllib2.urlopen(req_2).read()

# Convert the PDF and cache the result on disk. The text must be written and
# the handle closed before file_path_2 is read back below; opening the file
# with "w" and never writing to it leaves an empty (truncated) file behind.
# NOTE(review): assumes dogcatcher.pdf_to_text returns a string - confirm.
data_2 = dogcatcher.pdf_to_text(pdf_2)
with open(file_path_2, "w") as output:
    output.write(data_2)

# Load both cached documents, closing each handle as soon as it has been read.
with open(file_path_1) as source_1:
    absdata = source_1.read()
with open(file_path_2) as source_2:
    regdata = source_2.read()

#Check to make sure that W I doesn't appear in the source documents before running.
# The same literal clean-ups are applied to both documents, one substitution
# per statement, before each text is passed to dogcatcher.po_standardize.
# NOTE(review): replacing "" with "" leaves the text unchanged; the search
# string may have lost a non-printing character - confirm against the data.
absdata = absdata.replace("W I","WI")
absdata = absdata.replace("","")
absdata = absdata.replace("ONE FIRST","1 FIRST")
absdata = dogcatcher.po_standardize(absdata)

regdata = regdata.replace("W I","WI")
regdata = regdata.replace("","")
regdata = regdata.replace("ONE FIRST","1 FIRST")
regdata = dogcatcher.po_standardize(regdata)

# Spell out the "N." abbreviation; each document uses its own literal context.
absdata = absdata.replace("\nN. ","\nNorth ")
regdata = regdata.replace("N. S","North S")

# With DOTALL, each match runs from where the previous match ended through the
# next HH:MM:SS AM/PM timestamp, line breaks included.
# NOTE(review): if the text holds several timestamps, everything before the
# last one is matched and removed - confirm that is the intent.
header_re = re.compile(r".+?\d{2}:\d{2}:\d{2} [AP]M", flags=re.DOTALL)

# Delete every occurrence of each matched header text from absdata.
for item in header_re.findall(absdata):
    absdata = absdata.replace(item,"")

# Record-level patterns. Backslashes meant for the regex engine are written
# explicitly (doubled, or in a raw string) so the pattern text is unchanged
# while the literals no longer rely on unrecognized string escapes.

# Two capital letters through "TOWN CLERK" up to the next blank line (captured
# without the blank line).
abstown_re = re.compile("([A-Z][A-Z].+?TOWN CLERK.+?)\n\n", flags=re.DOTALL)
# "REGISTRAR(S) OF ..." through "CT", a 5-digit ZIP (optionally extended) and
# the blank line that follows.
regtown_re = re.compile("REGISTRAR[S]* OF .+?CT\\s*\\d{5}[-\\d]*\n\n", flags=re.DOTALL)
# Captures the rest of the line after the first ", " that follows "REGIS...".
regtown_name_re = re.compile("REGIS.+?, (.+)")
# Captures whatever precedes " TOWN CLERK" on the same line.
abstown_name_re = re.compile("(.+) TOWN CLERK")
# A space followed by a bracketed or parenthesized tag, e.g. " (D)".
party_re = re.compile(r" [\[\(].+?[\)\]]")
Ejemplo n.º 2
# Address-parsing patterns. Regex backslashes are spelled explicitly (doubled
# or raw) so the pattern text is unchanged while the literals no longer rely
# on unrecognized string escapes.

# Two capital letters with a space on each side; captures the letters only.
state_re = re.compile(" ([A-Z][A-Z]) ")
# "<city>, <ST> <zip>" where the city part holds no comma, tab or newline.
csz_re = re.compile("[^,\t\n]+?, [A-Z][A-Z] \\d{5}[\\d-]*")
# Captures the text before the first comma.
city_re = re.compile("(.+?),")
# Five digits plus any trailing digits or hyphens.
zip_re = re.compile(r"\d{5}[\d-]*")
# Any single character other than comma, period, space, newline or tab.
is_street_re = re.compile("[^,\\. \n\t]")
# Optional spaces and commas immediately before a newline.
street_break_re = re.compile(" *,* *\n")
# Two commas separated only by optional spaces, plus any spaces after them.
multi_comma_re = re.compile(", *, *")
# A run of two or more spaces.
multi_space_re = re.compile("  +")

# Normalize the raw text with literal substitutions, applied in this order,
# before handing it to dogcatcher.po_standardize.
data = (
	data
	.replace("- ","-")
	.replace(" and<br>\n",", ")
	#fixing an edge case in Morris County
	# NOTE(review): the replacement text ends in "<br" with no closing ">" -
	# confirm this matches the source page.
	.replace("-4:30pm","-4:30pm<br")
	#fixing an edge case in Mercer County
	.replace("(FAX) 609-989-6888<br>\nOffice Hours: 8:00am-4:00pm","(FAX) 609-989-6888<br>\nOffice Hours: 8:00am-4:00pm<br>")
)
data = dogcatcher.po_standardize(data)

# Collect every county_data_re match from the cleaned text.
county_data = county_data_re.findall(data)

#In each county, there are separate offices for registration and absentee ballots. This separates those offices and then applies essentially identical procedures to both.
for county in county_data:
	# Rebind all 36 per-record fields from dogcatcher.begin on every iteration.
	# NOTE(review): presumably these are blank defaults for voter_state - verify in dogcatcher.
	(
		authority_name, first_name, last_name, county_name, town_name, fips,
		street, city, address_state, zip_code,
		po_street, po_city, po_state, po_zip_code,
		reg_authority_name, reg_first, reg_last,
		reg_street, reg_city, reg_state, reg_zip_code,
		reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
		reg_phone, reg_fax, reg_email, reg_website, reg_hours,
		phone, fax, email, website, hours, review,
	) = dogcatcher.begin(voter_state)

	# First county_name_re match within this county's text.
	county_name = county_name_re.findall(county)[0]

	#This isolates the county clerk data from the complete county.
	clerk = clerk_re.findall(county)[0]

	# Pull the clerk's name out of the clerk section and split it;
	# split_name also returns the (possibly updated) review value.
	clerk_name = name_re.findall(clerk)[0]
	first_name, last_name, review = dogcatcher.split_name(clerk_name, review)
Ejemplo n.º 3
# Patterns for pulling address and contact fields out of the HTML. Raw strings
# keep the regex escapes explicit.

# A whole <p>...</p> holding "<non-digits>, <ST> <zip>"; captures the element.
csz_re = re.compile(r"(<p>[^\d]+?, *[A-Z][A-Z] *\d{5}[\d-]*</p>)")
# Captures the text between "<p>" and the first comma.
city_re = re.compile("<p>(.+?),")
# Two capital letters with a space on each side (the spaces are part of the match).
state_re = re.compile(" [A-Z][A-Z] ")
# Five digits plus any trailing digits or hyphens.
zip_re = re.compile(r"\d{5}[-\d]*")
# After a "</p>", captures the next "<p>" content that ends in a ZIP and "</p>".
address_re = re.compile(r"</p>.+?<p>(.+? \d{5}[\d-]*</p>)", re.DOTALL)
# Captures text starting at a "P...O... " prefix up to the closing "</p>".
po_re = re.compile("(P.* *O.* .+?)</p>")

# A lazy "(.+?)" with nothing after it stops after one character, so these
# used to capture only the first character of the number. They now capture
# everything up to the next tag or the end of the line.
# NOTE(review): assumes the number is followed by a tag or a line break -
# confirm against the source page.
phone_re = re.compile(r"Phone: ([^<\n]+)")
fax_re = re.compile(r"Fax: ([^<\n]+)")

# Captures everything on the line before "</h2>".
town_name_re = re.compile("(.+?)</h2>")

# Read the cached page, closing the handle as soon as the text is in memory.
with open(file_path) as source_file:
	data = source_file.read()
# Swap the "Last Updated" paragraph opener for a div boundary.
# NOTE(review): presumably this gives county_re a consistent delimiter - verify.
data = data.replace("<p style=\"clear:both;\">Last Updated:","</div><div style=\"clear:left;\">")
data = dogcatcher.po_standardize(data)

# Collect every county_re match from the cleaned page.
county_data = county_re.findall(data)

for county in county_data:

	# Rebind all 36 per-record fields from dogcatcher.begin on every iteration.
	# NOTE(review): presumably these are blank defaults for voter_state - verify in dogcatcher.
	(
		authority_name, first_name, last_name, county_name, town_name, fips,
		street, city, address_state, zip_code,
		po_street, po_city, po_state, po_zip_code,
		reg_authority_name, reg_first, reg_last,
		reg_street, reg_city, reg_state, reg_zip_code,
		reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
		reg_phone, reg_fax, reg_email, reg_website, reg_hours,
		phone, fax, email, website, hours, review,
	) = dogcatcher.begin(voter_state)

	authority_name = "Municipal Clerk"
	# Parenthesized so the statement parses on Python 3 as well; with a single
	# argument the output on Python 2 is unchanged.
	print(authority_name)

	#There are many edits to town names needed to make the data come off of the Google Maps API well.
	# Take the first heading match, expand "Plt" and trim periods from both ends.
	town_name = town_name_re.findall(county)[0].replace("Plt","Plantation").strip(".")

	if town_name == "Rockwood Strip":
		town_name = town_name.replace(" Strip","")
Ejemplo n.º 4

# Download the voter-registrar duties page.
reg_url = "http://www.sos.state.tx.us/elections/voter/votregduties.shtml"
reg_data = urllib.urlopen(reg_url).read()
# Cache the page on disk. The content must be written and the handle closed
# before reg_file_path is read back below; opening the file with "w" and never
# writing to it leaves an empty (truncated) file behind.
with open(reg_file_path,"w") as output:
	output.write(reg_data)

# Load both cached pages, closing each handle as soon as it has been read.
with open(file_path) as source_file:
	data = source_file.read()
with open(reg_file_path) as reg_source_file:
	reg_data = reg_source_file.read()

# fix some issues in the data
# data = data.replace("550 E. 2nd Ave Belton 76513", "550 E. 2nd Ave, Belton 76513")

# Main page: decode the &quot; and &amp; entities, drop ", TX", then run
# dogcatcher.po_standardize on the result.
data = data.replace("&quot;","'")
data = data.replace("&amp;","&")
data = data.replace(", TX","")
data = dogcatcher.po_standardize(data)

# Registrar page: same steps, except &amp; is left as it is.
reg_data = reg_data.replace("&quot;","'")
reg_data = reg_data.replace(", TX","")
reg_data = dogcatcher.po_standardize(reg_data)

# A comma immediately followed by a non-whitespace character.
no_space_re = re.compile(r",[^\s]")

# Rewrite each such pair in the main page using dogcatcher.insert.
# NOTE(review): presumably insert(item, " ", 1) puts a space after the comma - verify in dogcatcher.
for item in no_space_re.findall(data):
	data = data.replace(item, dogcatcher.insert(item, " ", 1))

# Same repair for the registrar page. This loop used to rewrite `data` with
# the matches found in reg_data, so reg_data itself was never corrected.
for item in no_space_re.findall(reg_data):
	reg_data = reg_data.replace(item, dogcatcher.insert(item, " ", 1))

# Patterns for splitting the <dl> lists. Regex backslashes are spelled
# explicitly (doubled or raw) so the pattern text is unchanged while the
# literals no longer rely on unrecognized string escapes.

# Captures the "<dt>...</dd>" body of a <dl> element, without the whitespace
# just inside the <dl> tags.
county_re = re.compile(r"<dl>\s*(<dt>.+?</dd>)\s*</dl>", flags=re.DOTALL)
# Captures the text after "dd>" up to the next "<", excluding whitespace just
# before that "<"; the capture cannot begin with a line break.
county_data_item_re = re.compile("dd>([^\n\r]+?\\s*[^\n<]*?)\\s*<", flags=re.DOTALL)
# Captures everything after "dd>" up to the next "<", minus whitespace before it.
reg_county_data_item_re = re.compile(r"dd>(.+?)\s*<", flags=re.DOTALL)
# Captures the tag-free text between a 4-character "<..>" opener and "</dt>".
county_name_re = re.compile("<..>([^<>]+?)</dt>")