def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'carroll'
        pages = []
        #Use elements like below to find xpath keys and click through
        time.sleep(np.random.uniform(5, 10, 1))
        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        pages.append(store_source)
        page_index += 1
        finished = False
        while not finished:
            try:
                nextpage = browser.find_element_by_link_text('Next >')
                nextpage.click()
                time.sleep(np.random.uniform(5, 10, 1))
                #Extract the HTML
                store_source = browser.page_source
                if store_source not in pages:
                    pages.append(store_source)
                    save_to_s3(store_source, page_index, roster_row)
                    logger.info('Saved page _%s_', page_index)
                    page_index += 1
                else:
                    finished = True
            except Exception:
                finished = True
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
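# The dedup-until-repeat pagination above (click 'Next >' until the same HTML
# comes back) recurs in several of these scrapers. A minimal reusable sketch
# of that pattern, assuming the same Selenium/numpy imports the scrapers use;
# the helper name and signature are hypothetical, not part of the original
# toolkit:
def paginate_by_link_text(browser, link_text='Next >'):
    pages = [browser.page_source]
    while True:
        try:
            browser.find_element_by_link_text(link_text).click()
        except Exception:
            break  # no pagination link on this page
        time.sleep(np.random.uniform(5, 10, 1))
        source = browser.page_source
        if source in pages:  # same page served again: we are past the last page
            break
        pages.append(source)
    return pages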
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'mower'
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'maine'
        assert roster_row['County'].lower() == 'penobscot'
        req = requests.get(urlAddress)
        page_data = req.content
        save_to_s3(page_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_"
                            % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.jailinmates_aspx(roster_row)
        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except Exception:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_"
                            % (THIS_COUNTY, THIS_STATE, roster_row))
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(5, 10, 1))
        #Use elements like below to find xpath keys and click through
        #Click "SELECT AN INMATE"
        elem = browser.find_element_by_xpath('//*[@id="dropdownMenuButton"]')
        elem.click()
        time.sleep(np.random.uniform(2, 4, 1))
        #Extract the HTML
        store_source = browser.page_source
        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ####################################
        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except Exception:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_"
                            % (THIS_COUNTY, THIS_STATE, roster_row))
        browser.get(urlAddress)
        #Wait
        time.sleep(np.random.uniform(5, 10, 1))
        #Extract the HTML
        link = browser.find_element_by_partial_link_text("Inmate List")
        link.click()
        time.sleep(np.random.uniform(5, 10, 1))
        link = browser.find_element_by_partial_link_text("Jail Listing")
        link.click()
        pdf_data = browser.page_source
        ## Code to save a page and log appropriately
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except Exception:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_"
                            % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.save_single_page(roster_row)  # try to call a known crawler if possible
        browser.get(urlAddress)
        #Show all inmates instead of 6 per page
        time.sleep(np.random.uniform(5, 10, 1))
        show_all = browser.find_element_by_xpath('//*[@id="inmatesPerPage"]')
        show_all.send_keys('All')
        logger.info('clicked "All"')
        #Wait
        time.sleep(np.random.uniform(15, 20, 1))
        #Extract the HTML
        store_source = browser.page_source
        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except Exception:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        """
        Marion County Alabama, not Missouri
        OLD URL: http://mcsomo.com/current-inmates/
        NEW URL: http://www.marionsoal.com/roster.php
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0  # initialize so the error handler below can reference it
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'marion'
        suffix = '?grp={}'
        req = requests.get(urlAddress)
        #Extract the HTML
        store_source = req.content
        soup = BeautifulSoup(store_source, 'lxml')
        #Extract number of inmates:
        inmate_roster = int(re.sub(r"\D", "", soup.find('span', {"class": "ptitles"}).text))
        #20 entries per page; get number of pages by dividing by 20, rounding up.
        num_pages = math.ceil(inmate_roster / 20)
        for page_index in range(0, num_pages):
            time.sleep(np.random.uniform(5, 10, 1))
            req = requests.get(urlAddress + suffix.format((page_index + 1) * 20))
            store_source = req.content
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def omsweb_crawler(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        if 'omsweb' not in urlAddress:
            raise Exception("Appears that this site _%s_ is not a public safety web site" % urlAddress)
        logger.info('using omsweb_crawler for _%s, %s_', roster_row['County'], roster_row['State'])
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        browser.get(urlAddress)
        time.sleep(np.random.uniform(5, 10, 1))
        pages = []
        store_source = browser.page_source
        pages.append(store_source)
        finished = False
        while not finished:
            try:
                nextpage = browser.find_element_by_xpath('//*[@id="ext-gen110"]')
                nextpage.click()
                time.sleep(np.random.uniform(5, 10, 1))
                store_source = browser.page_source
                if store_source not in pages:
                    pages.append(store_source)
                else:
                    finished = True
            except Exception:
                finished = True
        #Close the browser
        browser.close()
        for page_index, store_source in enumerate(pages):
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            try:
                page_index = len(pages)
            except Exception:
                page_index = 0
            record_error(message=str(errorMessage), roster_row=roster_row,
                         page_number_within_scrape=page_index, browser=browser)
        except Exception:
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        browser.close()
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
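# Hypothetical usage sketch for omsweb_crawler above: the guard only checks
# that 'omsweb' appears in the URL, so any omsweb/public-safety-cloud roster
# row from the spreadsheet should work (the row values below are made up for
# illustration):
# omsweb_crawler({'State': 'Example', 'County': 'Example',
#                 'Working Link': 'https://omsweb.public-safety-cloud.com/...'})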
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'washington'
        suffix = '?grp={}'
        req = requests.get(urlAddress)
        #Extract the HTML
        store_source = req.content
        soup = BeautifulSoup(store_source, 'lxml')
        #Extract number of inmates:
        inmate_roster = int(re.sub(r"\D", "", soup.find('span', {"class": "ptitles"}).text))
        #10 entries per page; get number of pages by dividing by 10, rounding up.
        num_pages = math.ceil(inmate_roster / 10)
        #Toggle local/s3 storage
        for page in range(0, num_pages):
            page_index += 1
            time.sleep(np.random.uniform(5, 10, 1))
            req = requests.get(urlAddress + suffix.format((page + 1) * 10))
            store_source = req.content
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        ###
        req = requests.get(urlAddress)
        page_data = req.content
        save_to_s3(page_data, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        page_index = 1
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'houston'
        time.sleep(np.random.uniform(5, 10, 1))
        #Step the pager's page-size dropdown down twice and select the last option
        dropdown = browser.find_element_by_xpath('//*[@id="gvInmates_DXPagerBottom_PSI"]')
        dropdown.click()
        dropdown.send_keys(Keys.DOWN)
        time.sleep(np.random.uniform(5, 10, 1))
        dropdown = browser.find_element_by_xpath('//*[@id="gvInmates_DXPagerBottom_PSI"]')
        dropdown.send_keys(Keys.DOWN)
        time.sleep(np.random.uniform(5, 10, 1))
        dropdown = browser.find_element_by_xpath('//*[@id="gvInmates_DXPagerBottom_PSI"]')
        dropdown.send_keys(Keys.RETURN)
        time.sleep(np.random.uniform(5, 10, 1))
        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        OLD URL: https://www.boonesheriff.com/mobile/roster.php
        UPDATED URL: https://www.boonesheriff.com/roster.php
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'boone'
        suffix = '?grp={}'
        req = requests.get(urlAddress)
        #Extract the HTML
        store_source = req.content
        soup = BeautifulSoup(store_source, 'lxml')
        #Extract number of inmates:
        inmate_roster = int(re.sub(r"\D", "", soup.find('h2', {"class": "large-6 columns ptitles"}).text))
        #10 entries per page; get number of pages by dividing by 10, rounding up.
        num_pages = math.ceil(inmate_roster / 10)
        #Mark the time the file is collected
        for page in range(0, num_pages):
            page_index += 1
            time.sleep(np.random.uniform(5, 10, 1))
            req = requests.get(urlAddress + suffix.format((page + 1) * 10))
            store_source = req.content
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        PDF LINK IS OUT OF DATE
        OLD URL: http://www.yolocountysheriff.com/wp-content/uploads/2019/02/January-2019-webiste-upload.pdf
        NEW URL: https://www.yolocountysheriff.com/services/jail/booking-statistics/
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'yolo'
        req = requests.get(urlAddress)
        store_source = req.content
        soup = BeautifulSoup(store_source, 'lxml')
        link_to_pdf = soup.find('article', {'id': 'post-356'})
        pdf_url = link_to_pdf.find_all('a')[1]['href']
        time.sleep(np.random.uniform(5, 10, 1))
        req2 = requests.get(pdf_url)
        pdf_data = req2.content
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'caldwell'
        time.sleep(np.random.uniform(5, 10, 1))
        """This agency is providing this roster of incarcerated offenders to the
        public and law enforcement in the interest of public safety. This
        information shall not be considered, or used as, a public document, or
        official document, and no other publication or copying of this
        information is allowed without the express written consent of this
        agency. Any unauthorized use of this information is forbidden and
        subject to criminal prosecution."""
        #Click I agree to terms
        elem = browser.find_element_by_xpath('//*[@id="OkButton"]')
        elem.click()
        #Wait
        time.sleep(np.random.uniform(10, 15, 1))
        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'st. tammany'
        pages = []
        finished = False
        #Pagination is done by number
        page_index = 1
        while not finished:
            try:
                nextpage = browser.find_element_by_link_text('›')
                nextpage.click()
                page_index += 1
                time.sleep(np.random.uniform(5, 10, 1))
                #Extract the HTML
                store_source = browser.page_source
                save_to_s3(store_source, page_index, roster_row)
                logger.info('Saved page _%s_', page_index)
            except Exception:
                finished = True
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        If there are no inmates' names for a particular letter, the previous
        letter will be stored.
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0  # initialize so the error handler below can reference it
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'shelby'
        pages = []
        letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        pages.append(store_source)
        for letter in letters:
            pagelink = browser.find_element_by_xpath('//*[@id="btn_{}"]'.format(letter))
            pagelink.click()
            time.sleep(np.random.uniform(5, 10, 1))
            store_source = browser.page_source
            save_to_s3(store_source, letter, roster_row)
            logger.info('Saved page _%s_', letter)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def roster_php(roster_row, num_per_page=20):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        if 'roster.php' not in urlAddress:
            raise Exception("Appears that this site _%s_ is not a roster.php website - using the wrong crawler" % urlAddress)
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Choosing roster_php crawler')  # Name crawler
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        suffix = '?grp={}'
        browser.get(urlAddress)
        #Use elements like below to find xpath keys and click through
        time.sleep(np.random.uniform(300, 400, 1))
        store_source = browser.page_source
        soup = BeautifulSoup(store_source, 'lxml')
        #Extract the roster count (e.g. ">Inmate Roster (151)</h2>"); the count
        #element's markup varies by site, so try both known variants. Get the
        #number of pages by dividing by num_per_page, rounding up.
        try:
            inmate_roster = int(re.sub(r"\D", "", soup.find('span', {"class": "ptitles"}).text))
        except Exception:
            inmate_roster = int(re.sub(r"\D", "", soup.find('h2', {"class": "large-6 columns ptitles"}).text))
        num_pages = math.ceil(inmate_roster / num_per_page)
        pages = []
        for page in range(0, num_pages):
            time.sleep(np.random.uniform(5, 10, 1))
            url = urlAddress + suffix.format((page + 1) * num_per_page)
            logger.info('getting url _%s_', url)
            browser.get(url)
            store_source = browser.page_source
            logger.info('Found page _%s_', page)
            pages.append(store_source)
        for store_source in pages:
            page_index += 1
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
    except Exception as errorMessage:
        try:
            record_error(message=str(errorMessage), roster_row=roster_row,
                         page_number_within_scrape=page_index, browser=browser)
        except Exception:
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        browser.close()
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
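# Usage sketch for roster_php above: these sites paginate via a ?grp=N query
# parameter in steps of num_per_page, so pass the per-page count the target
# site actually uses. For example, the Boone County roster.php site above
# serves 10 entries per page (the call below is illustrative; boone_row is a
# hypothetical spreadsheet row):
# roster_php(boone_row, num_per_page=10)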
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'norman'
        #Wait
        time.sleep(np.random.uniform(5, 10, 1))
        pdf_link = browser.find_element_by_xpath("//*[contains(@href, 'Inmate_Roster')]")
        pdf_link.click()
        #new_tab = browser.find_element_by_xpath('/html/body/div/div/div/table/tbody/tr/td[2]/div/div[2]/div/a')
        #new_tab.click()
        #Wait
        time.sleep(np.random.uniform(5, 10, 1))
        pdf_data = browser.page_source
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
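# Note: browser.page_source returns the DOM's HTML even when the tab renders
# a PDF, so the function above may store HTML under a .pdf key. A minimal
# hedged alternative that fetches the raw bytes instead, assuming the roster
# link's href points directly at the PDF (find_element_by_xpath,
# get_attribute, and requests.get are the same calls used elsewhere in these
# scrapers; the helper name is hypothetical):
def fetch_pdf_bytes(browser, href_fragment='Inmate_Roster'):
    link = browser.find_element_by_xpath("//*[contains(@href, '%s')]" % href_fragment)
    return requests.get(link.get_attribute('href')).content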
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        #browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'ouachita'
        suffix = '?id={}'
        letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
        pages = []
        for letter in letters:
            req = requests.get('http://' + urlAddress + suffix.format(letter))
            store_source = req.content
            pages.append(store_source)
            save_to_s3(store_source, letter, roster_row)
            logger.info('Saved page _%s_', letter)
            time.sleep(np.random.uniform(5, 10, 1))
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'amador'
        time.sleep(np.random.uniform(5, 10, 1))
        elem = browser.find_element_by_xpath('//*[@id="cmdCloseMessage"]')
        elem.click()
        time.sleep(np.random.uniform(5, 10, 1))
        inmates_link = browser.find_element_by_xpath('//*[@id="InmatesMenu"]')
        inmates_link.click()
        time.sleep(np.random.uniform(5, 10, 1))
        save_to_s3(browser.page_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        OLD URL: http://www.calcoso.org/divisions-jail-inmate-roster/
        UPDATED URL: https://www.calcoso.org/inmate-roster/
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        page_index = 1
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'clarke'
        req = requests.get(urlAddress)
        store_source = req.content
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        IFRAME SITE
        OLD URL: https://www.co.washington.ar.us/government/departments-f-z/sheriff/detention-information/detainee-roster-detailed
        NEW URL: https://www.so.washington.ar.us/res/DAlphaRoster.aspx
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'washington'
        req = requests.get(urlAddress)
        store_source = req.content
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        IFRAME SITE
        OLD URL: http://randolphcountyso.org/inmates.html
        NEW URL: http://randolphcountyso.org/cur_inmates.html
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'randolph'
        page_index = 0
        req = requests.get(urlAddress)
        store_source = req.content
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        Page has been moved.
        OLD URL: http://www.leecosheriff.com/Inmates/ICURRENT.HTM
        NEW URL: https://tcsi-roster.azurewebsites.net/default.aspx?i=26&code=Lee&type=roster
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'lee'
        req = requests.get(urlAddress)
        store_source = req.content
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        IFRAME SITE
        OLD URL: http://phillipscosheriff.com/Inmates
        NEW URL: http://phillips.pixelpowerhaus.net/
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'colorado'
        assert roster_row['County'].lower() == 'phillips'
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'otter tail'
        """
        OLD URL: http://www.co.otter-tail.mn.us/991/In-Custody-List
        NEW URL: https://www.ottertailcounty.us/sheriff/report/custody%20list.rpt.html
        """
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'kings'
        time.sleep(np.random.uniform(5, 10, 1))
        dropdown = browser.find_element_by_xpath('//*[@id="DataTables_Table_0_length"]/label/select')
        dropdown.send_keys('All', Keys.RETURN)
        time.sleep(np.random.uniform(5, 10, 1))
        save_to_s3(browser.page_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
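# The send_keys('All', Keys.RETURN) call above drives the page-length
# <select> like a keyboard user. An equivalent sketch using Selenium's
# Select helper (selenium.webdriver.support.ui.Select is the standard API;
# the xpath is the one the function above targets):
def show_all_rows(browser):
    from selenium.webdriver.support.ui import Select
    dropdown = browser.find_element_by_xpath('//*[@id="DataTables_Table_0_length"]/label/select')
    Select(dropdown).select_by_visible_text('All')  # DataTables "Show N entries" menu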
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'iberville'
        time.sleep(np.random.uniform(5, 10, 1))
        search = browser.find_element_by_xpath('//*[@id="right"]/center/form/table/tbody/tr[14]/td[2]/input')
        search.click()
        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        OLD URL: https://www.elbertcountysheriff.com/detention/inmateinfo/
        UPDATED URL: https://www.inmateinfo.net/inmateinfo.php?org=ecso
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'colorado'
        assert roster_row['County'].lower() == 'elbert'
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'crow wing'
        #Use elements like below to find xpath keys and click through
        # NOTE: Looks like there's a site problem with a 404; the roster link
        # currently points to this FAQ entry:
        # https://www.crowwing.us/Faq.aspx?QID=297
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error