combined_csv_path = combine_csvs(os.getenv('INDIVIDUAL_LA_CSV_DIR'),
                                 os.getenv('COMBINED_LA_CSV_DIR'))
difference_csv_path = create_difference(combined_csv_path)

create_zip_from_file(combined_csv_path, os.getenv('FOR_UPLOAD_DIR'))

if difference_csv_path:
    logger.debug("creating difference")
    logger.debug(combined_csv_path)
    logger.debug(difference_csv_path)
    create_zip_from_file(difference_csv_path, os.getenv('FOR_UPLOAD_DIR'))

# Now upload any files in 'for_upload' to Azure blob storage which aren't
# already there. Because the files are being transferred straight from EB
# this should be fast - on the order of 100MB/s.
logger.info("uploading files to azure from {}".format(os.getenv('FOR_UPLOAD_DIR')))
upload_all_new_azure(os.getenv('FOR_UPLOAD_DIR'), azure_container,
                     os.getenv('ACC_NAME'), os.getenv('ACCESS_KEY'))

# Finally, generate a small webpage containing links to all the uploaded files
logger.info("generating website {}".format(os.getenv('FOR_UPLOAD_DIR')))
generate_website_and_upload_azure(azure_container, "web")

logger.info("successfully completed whole script")

# Upload the log
upload_log()
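
# The helpers used above (combine_csvs, create_difference, create_zip_from_file,
# upload_all_new_azure, generate_website_and_upload_azure, upload_log) are
# defined elsewhere in this repo. For orientation only, here is a minimal
# sketch of what upload_all_new_azure might look like, assuming the
# azure-storage-blob v12 SDK and that `container` is the container name; the
# real implementation may differ. The sketch name marks it as hypothetical.
from azure.storage.blob import BlobServiceClient


def _upload_all_new_azure_sketch(local_dir, container, acc_name, access_key):
    service = BlobServiceClient(
        account_url="https://{}.blob.core.windows.net".format(acc_name),
        credential=access_key)
    client = service.get_container_client(container)
    # Only upload files whose names aren't already present in the container
    existing = {b.name for b in client.list_blobs()}
    for f in os.listdir(local_dir):
        if f not in existing:
            with open(os.path.join(local_dir, f), "rb") as fh:
                client.upload_blob(name=f, data=fh)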
def download_csvs(csv_file_directory):
    # shutil.rmtree(csv_file_directory)
    # os.makedirs(csv_file_directory)

    # Remove any per-authority CSVs left over from a previous run (their
    # names contain a "__" date separator)
    files = os.listdir(csv_file_directory)
    files = [f for f in files if ".csv" in f]
    files = [os.path.join(csv_file_directory, f) for f in files if "__" in f]
    for f in files:
        os.remove(f)

    url = 'http://ratings.food.gov.uk/open-data/en-GB'

    # Download the page that lists all the xml files; exit on failure
    try:
        r = requests.get(url)
    except Exception as e:
        logger.error(str(e))
        sys.exit()

    data = r.text  # html from the above url - it lists all the xml links
    # Parse into a navigable tree so we can extract the links
    soup = BeautifulSoup(data, "html.parser")

    # Get all hyperlinks on the page that are in English and contain FHRS
    # data. Note re.compile is basically doing a search/filter on the links.
    all_links = soup.find_all("a", text=re.compile('English'),
                              href=re.compile('FHRS'))
    del r
    del data
    del soup

    logger.debug(str(len(all_links)) + " links were found")

    links = [l["href"] for l in all_links]

    if len(links) < 350:
        logger.error("fewer than 350 xml files were found, there was some error")
        sys.exit()

    # links now contains every xml hyperlink we want to visit and download
    # links = [link for link in links if "324" in link]
    links_to_do = set(links)

    # The fields we want in our final table of data
    fieldslist = [
        "FHRSID", "LocalAuthorityBusinessID", "BusinessName", "BusinessType",
        "BusinessTypeID", "RatingValue", "RatingKey", "RatingDate",
        "LocalAuthorityCode", "LocalAuthorityName", "LocalAuthorityWebSite",
        "LocalAuthorityEmailAddress", "Hygiene", "Structural",
        "ConfidenceInManagement", "SchemeType", "Longitude", "Latitude",
        "AddressLine1", "AddressLine2", "AddressLine3", "PostCode",
        "AddressLine4", "RightToReply", "NewRatingPending"
    ]
    # Lowercase, to match the tag names the html parser produces
    fieldslist = [x.lower() for x in fieldslist]

    import datetime
    date_string = datetime.date.today().strftime("%Y%m%d")

    # Counters just to keep track of progress; not strictly needed
    counter_for_done = 0
    counter_for_error = 0
    all_links_len = len(links_to_do)
    failed_count_dict = {link: 0 for link in links_to_do}

    while len(links_to_do) > 0:

        if counter_for_done % 10 == 0:
            logger.debug("completed " + str(counter_for_done) + " xml downloads")
            upload_log()

        if counter_for_error > all_links_len / 3:
            logger.error("Even after retrying the downloads, we were unable "
                         "to download all the links. Exiting")
            sys.exit()

        this_link = links_to_do.pop()

        # Download the xml for this local authority
        try:
            r = requests.get(this_link)
        except Exception as e:
            logger.error(str(e))
            sys.exit()

        if "Internal Server Error" in r.text:
            links_to_do.add(this_link)
            logger.debug("Internal server error on link: " + this_link)
            continue

        # The server declares latin1 but the payload is really utf-8, so
        # re-encode the decoded text to recover the bytes and decode properly
        try:
            unicode_text = r.text.encode("latin1").decode("utf-8")
        except Exception:
            logger.debug("Can't convert text response from latin1 to utf-8 "
                         "on link: " + this_link)
            continue

        try:
            soup = BeautifulSoup(unicode_text, "html.parser")
            del r
        except Exception:
            # If this goes wrong put the link back into the pile
            links_to_do.add(this_link)
            logger.debug("Can't convert to soup on link: " + this_link)
            continue

        # Find the list of establishments
        try:
            est = soup.find_all("establishmentdetail")
        except Exception:
            links_to_do.add(this_link)
            logger.debug("Can't find establishmentdetail in link: " + this_link)
            continue

        if len(est) < 1:
            failed_count_dict[this_link] += 1
            if failed_count_dict[this_link] > 3:
                # Give up on this one
                counter_for_error += 1
                logger.debug("Can't find any establishmentdetails in link "
                             "even after 3 attempts: " + this_link)
                continue
            else:
                # Try again
                links_to_do.add(this_link)
                continue

        # For each establishment, find the data in each field and add a
        # row dictionary to finalarr
        finalarr = []
        for i in est:
            this_dict = {}
            for j in fieldslist:
                te = None
                try:
                    te = i.find(j).text
                except Exception:
                    pass
                this_dict[j] = te
            finalarr.append(this_dict)

        # Check that the csv looks ok: does it have at least one row?
        df = pd.DataFrame(finalarr)
        if df.shape[0] < 1:
            links_to_do.add(this_link)
            logger.debug("Can't find any premises in link: " + this_link)
            continue

        # Write this authority's data to a csv file
        file_name = this_link.replace(
            "http://ratings.food.gov.uk/OpenDataFiles/", "").replace(
                "en-GB.xml", "")
        file_name = os.path.join(csv_file_directory,
                                 date_string + "__" + file_name + ".csv")
        df.to_csv(file_name, encoding="utf-8", index=False)

        counter_for_done += 1
        del df
        del finalarr
        gc.collect()

    for i in failed_count_dict:
        if failed_count_dict[i] > 3:
            logger.warning("the file " + i + " contained no establishments")

    logger.info("completed successfully")