# NOTE(review): this fragment arrived collapsed onto a single line; the
# nesting below is reconstructed from syntax and should be confirmed against
# the original layout.

# Lazily matches text up to a two-letter uppercase code followed by a
# five-digit number and any trailing digits/hyphens (e.g. "... MO 65101-1234").
address_re = re.compile(r".+?[A-Z]{2} \d{5}[\d-]*")

for county_title in county_list:

    # Reset every per-county output field from the values dogcatcher.begin
    # returns for this state.
    (authority_name, first_name, last_name, county_name, town_name, fips,
     street, city, address_state, zip_code,
     po_street, po_city, po_state, po_zip_code,
     reg_authority_name, reg_first, reg_last,
     reg_street, reg_city, reg_state, reg_zip_code,
     reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
     reg_phone, reg_fax, reg_email, reg_website, reg_hours,
     phone, fax, email, website, hours, review) = dogcatcher.begin(voter_state)

    county_name = county_title

    # The saved page is looked up by the raw title, before county_name is
    # reformatted below.
    file_path = tmpdir + county_name + "-mo-clerks.html"

    # Read through a context manager so the file handle is closed promptly.
    with open(file_path) as county_file:
        county = county_file.read()

    # NOTE(review): as rendered, the first replace swaps a space for a space
    # (a no-op). It presumably targeted a non-breaking space or "&nbsp;"
    # originally -- confirm against the upstream script.
    county = county.replace(" "," ").replace("Post Office","P.O.").replace("Ste Genevieve","Ste. Genevieve")
    county = county.replace("\n","").replace("\r","")

    # For each space_re match, insert a space one character after where the
    # match starts, except in names containing "Mc".
    for item in space_re.findall(county_name):
        if "Mc" not in county_name:
            county_name = dogcatcher.insert(county_name," ",county_name.find(item)+1)

    # The spacing step above splits "DeKalb"; undo that.
    if "De Kalb" in county_name:
        county_name = "DeKalb"

    authority_name = authority_name_re.findall(county)[0]

    # Removing as much as possible from the data makes it easier to find the
    # address later.
    county = county.replace(authority_name,"")

    # NOTE(review): office_name is not assigned anywhere in the visible code;
    # this may have been meant to clean authority_name -- confirm.
    office_name = office_name.replace(county_name,"").replace("County","").strip()

    website = dogcatcher.website_find(website_re, county)
    phone = dogcatcher.phone_find(phone_re, county)
    fax = dogcatcher.phone_find(fax_re, county)

    # Removing as much as possible from the data makes it easier to find the
    # address later: every html_re match becomes a ", " separator, and
    # leading/trailing separators are trimmed after each replacement.
    for item in html_re.findall(county):
        county = county.replace(item,", ").strip(", ")
# NOTE(review): this fragment arrived collapsed onto a single line; statement
# boundaries and loop bodies below are reconstructed from syntax.

# Finish writing the registration data; `output` is opened before this point.
output.write(reg_data)
output.close()

# Re-read both saved pages, closing each handle as soon as it has been read.
with open(file_path) as data_file:
    data = data_file.read()
with open(reg_file_path) as reg_data_file:
    reg_data = reg_data_file.read()

# fix some issues in the data
# data = data.replace("550 E. 2nd Ave Belton 76513", "550 E. 2nd Ave, Belton 76513")

# NOTE(review): the entity literals here were rendered as a bare '"' and a
# no-op "&" -> "&"; they are restored as "&quot;" and "&amp;" on the
# assumption that HTML entities were decoded in transit -- confirm upstream.
data = dogcatcher.po_standardize(data.replace("&quot;","'").replace("&amp;","&").replace(", TX",""))
reg_data = dogcatcher.po_standardize(reg_data.replace("&quot;","'").replace(", TX",""))

# A comma immediately followed by a non-whitespace character.
no_space_re = re.compile(r",[^\s]")

# Put a space after each such comma (dogcatcher.insert presumably inserts the
# given string at the given index of the matched text).
for item in no_space_re.findall(data):
    data = data.replace(item, dogcatcher.insert(item, " ", 1))

# Same clean-up for the registration page. The previous code rewrote `data`
# here, so reg_data was never actually corrected.
for item in no_space_re.findall(reg_data):
    reg_data = reg_data.replace(item, dogcatcher.insert(item, " ", 1))

# One <dl>...</dl> entry, capturing from its first <dt> to its last </dd>.
county_re = re.compile(r"<dl>\s*(<dt>.+?</dd>)\s*</dl>", re.DOTALL)

# The text of one <dd> item within an entry.
county_data_item_re = re.compile(r"dd>([^\n\r]+?\s*[^\n<]*?)\s*<", re.DOTALL)
reg_county_data_item_re = re.compile(r"dd>(.+?)\s*<", re.DOTALL)

# The text inside the tag that closes with </dt>.
county_name_re = re.compile(r"<..>([^<>]+?)</dt>")

# Lazily matches a single non-digit character.
name_re = re.compile(r"[^\d]+?")

# A middle initial such as " J. ". The class is [a-zA-Z]; the former
# [a-zA-z] also matched the punctuation between "Z" and "a".
middle_name_re = re.compile(r" ([a-zA-Z]\. )")

# "(ddd) ddd-dddd" phone numbers found between tags; the registration variant
# captures only the number plus an optional extension such as "/ext. 12".
phone_re = re.compile(r">(\(\d{3}\) \d{3}-\d{4}.*?)<")
reg_phone_re = re.compile(r">(\(\d{3}\) \d{3}-\d{4}[/ext\.\d ]*).*?<", re.DOTALL)