Beispiel #1
0
# Matches, as lazily as possible, any text that ends in two capital letters
# (presumably the state abbreviation), a space, and five digits optionally
# followed by more digits/hyphens (e.g. a ZIP+4 suffix).
# Written as a raw string so "\d" reaches the regex engine untouched instead
# of depending on Python passing an unknown string escape through.
address_re = re.compile(r".+?[A-Z]{2} \d{5}[\d-]*")


# Process one saved county page per entry in county_list: load the HTML,
# normalise the county name, then pull out the authority name, website,
# phone and fax before stripping markup so the address is easier to find.
for county_title in county_list:

	# Initialise every output field for this county from dogcatcher.begin().
	(authority_name, first_name, last_name, county_name, town_name, fips,
		street, city, address_state, zip_code,
		po_street, po_city, po_state, po_zip_code,
		reg_authority_name, reg_first, reg_last,
		reg_street, reg_city, reg_state, reg_zip_code,
		reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
		reg_phone, reg_fax, reg_email, reg_website, reg_hours,
		phone, fax, email, website, hours, review) = dogcatcher.begin(voter_state)

	county_name = county_title

	file_path = tmpdir + county_name + "-mo-clerks.html"

	# Use a context manager so the handle is closed right away; this loop
	# opens one file per county and previously never closed any of them.
	with open(file_path) as county_file:
		county = county_file.read()

	# NOTE(review): the first replace() looks like a no-op (space -> space);
	# its first argument was presumably a non-breaking space or "&nbsp;"
	# before the file was re-encoded -- confirm against the upstream script.
	county = county.replace(" "," ").replace("Post Office","P.O.").replace("Ste Genevieve","Ste. Genevieve")
	# Collapse the page onto a single line so the regexes need not span lines.
	county = county.replace("\n","").replace("\r","")

	# space_re is defined elsewhere; for each of its matches a space is
	# inserted one character after the match start, except in names
	# containing "Mc". "De Kalb" is then normalised to "DeKalb".
	for item in space_re.findall(county_name):
		if "Mc" not in county_name:
			county_name = dogcatcher.insert(county_name," ",county_name.find(item)+1)
		if "De Kalb" in county_name:
			county_name = "DeKalb"

	# Raises IndexError if the page contains no authority name.
	authority_name = authority_name_re.findall(county)[0]

	# Removing as much as possible from the data makes it easier to find the address later.
	county = county.replace(authority_name,"")

	# NOTE(review): office_name is read here but is not assigned anywhere in
	# the visible part of this loop -- confirm it is defined earlier (or
	# whether authority_name was intended).
	office_name = office_name.replace(county_name,"").replace("County","").strip()

	website = dogcatcher.website_find(website_re, county)
	phone = dogcatcher.phone_find(phone_re, county)
	fax = dogcatcher.phone_find(fax_re, county)

	# Removing as much as possible from the data makes it easier to find the address later:
	# every html_re match is replaced with ", " and stray leading/trailing
	# commas and spaces are trimmed after each replacement.
	for item in html_re.findall(county):
		county = county.replace(item,", ").strip(", ")
Beispiel #2
0
# Finish writing the registration data; `output` is opened earlier in the
# script, outside this excerpt.
output.write(reg_data)
output.close()

# Read both saved files back in. Context managers make sure the handles are
# closed as soon as the contents have been read.
with open(file_path) as data_file:
	data = data_file.read()
with open(reg_file_path) as reg_data_file:
	reg_data = reg_data_file.read()

# Fix some issues in the data.
# Example of a one-off manual correction (currently disabled):
# data = data.replace("550 E. 2nd Ave Belton 76513", "550 E. 2nd Ave, Belton 76513")

# Decode the HTML entities that appear in the pages, drop the redundant
# ", TX" suffix, and hand the result to dogcatcher.po_standardize().
# The entity names must be spelled out: a bare `"""` starts a triple-quoted
# string that swallows the following statement, and replacing "&" with "&"
# does nothing.
data = dogcatcher.po_standardize(data.replace("&quot;","'").replace("&amp;","&").replace(", TX",""))
reg_data = dogcatcher.po_standardize(reg_data.replace("&quot;","'").replace(", TX",""))

# A comma immediately followed by a non-whitespace character, i.e. a comma
# that is missing the space after it.
no_space_re = re.compile(r",[^\s]")

# Rewrite each such match via dogcatcher.insert(item, " ", 1), presumably
# putting a space right after the comma.
for item in no_space_re.findall(data):
	data = data.replace(item, dogcatcher.insert(item, " ", 1))

# Same clean-up for the registration data. The matches come from reg_data,
# so the replacement has to be applied to reg_data as well (not to data).
for item in no_space_re.findall(reg_data):
	reg_data = reg_data.replace(item, dogcatcher.insert(item, " ", 1))


# All patterns are raw strings so backslash escapes reach the regex engine
# unchanged.

# One <dl>...</dl> entry; captures everything from its <dt> to its last </dd>.
county_re = re.compile(r"<dl>\s*(<dt>.+?</dd>)\s*</dl>", re.DOTALL)
# Text following a "dd>" tag fragment, up to the next "<" (trailing whitespace dropped).
county_data_item_re = re.compile(r"dd>([^\n\r]+?\s*[^\n<]*?)\s*<",re.DOTALL)
# Same idea for the registration page, but the captured text may span lines.
reg_county_data_item_re = re.compile(r"dd>(.+?)\s*<", re.DOTALL)
# Text inside a two-character tag that is closed by </dt>.
county_name_re = re.compile(r"<..>([^<>]+?)</dt>")

# Lazy match of non-digit characters (yields one character per match).
name_re = re.compile(r"[^\d]+?")
# A single-letter initial such as " Q. ". The class is [a-zA-Z]; the range
# "A-z" would also accept the punctuation between "Z" and "a" ([ \ ] ^ _ `).
middle_name_re = re.compile(r" ([a-zA-Z]\. )")

# "(ddd) ddd-dddd" plus any trailing text up to the next "<".
phone_re = re.compile(r">(\(\d{3}\) \d{3}-\d{4}.*?)<")
# "(ddd) ddd-dddd" plus an optional extension made of "/", "e", "x", "t",
# ".", digits and spaces; anything after that, up to "<", is not captured.
reg_phone_re = re.compile(r">(\(\d{3}\) \d{3}-\d{4}[/ext\.\d ]*).*?<", re.DOTALL)