def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser()  # Get a standard browser
        #urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        #page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        #logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.save_single_page(roster_row)  # Try to call a known crawler if possible
        ## Code to save a page and log appropriately
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        logger.info('complete!')
    except Exception as errorMessage:
        # The browser is opened (if at all) inside the called crawler,
        # so guard the close rather than assume it exists.
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'otter tail'
        """
        OLD URL: http://www.co.otter-tail.mn.us/991/In-Custody-List
        NEW URL: https://www.ottertailcounty.us/sheriff/report/custody%20list.rpt.html
        """
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.omsweb_crawler(roster_row)
        # End core specific scraping code
        ##########

        logger.info('complete!')
    except Exception as errorMessage:
        # The browser is opened inside the called crawler, so guard the close.
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'becker'
        time.sleep(np.random.uniform(5, 10, 1))
        expand = browser.find_element_by_xpath(
            '//*[@id="main-content"]/div[3]/a[1]')
        expand.click()
        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # Extract the HTML using basic_multipage
        crawlers.basic_multipage(
            roster_row, next_type="xpath",
            next_string='//*[@id="JailRosterbuttondiv"]/a[8]'
        )  # Try to call a known crawler if possible
        # End core specific scraping code
        ##########

        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
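# A minimal sketch of the "click next until the page repeats" loop that the
# shared crawlers.basic_multipage helper is assumed to implement, based on the
# next_type/next_string arguments passed above and the pagination pattern the
# county-specific scripts in this collection use. The helper name and keyword
# arguments come from this codebase; the body below is an illustration under
# those assumptions, not the actual library code.
def basic_multipage_sketch(browser, next_type, next_string):
    pages = [browser.page_source]  # The first page is already loaded
    while True:
        try:
            if next_type == "xpath":
                nextpage = browser.find_element_by_xpath(next_string)
            else:
                nextpage = browser.find_element_by_link_text(next_string)
            nextpage.click()
        except:
            break  # No next button: we are on the last page
        time.sleep(np.random.uniform(5, 10, 1))
        source = browser.page_source
        if source in pages:
            break  # Page repeated: pagination has wrapped around
        pages.append(source)
    return pages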
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'morrison'
        """
        OLD URL: https://www.co.morrison.mn.us/?SEC=35BA1570-F608-40A9-9571-6968DD357BF6
        NEW URL: https://incustody.co.morrison.mn.us/
        """
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'hayward'
        time.sleep(np.random.uniform(5, 10, 1))
        #Click I Agree to the terms
        elem = browser.find_element_by_xpath(
            "//span[contains(text(),'I Agree')]")
        elem.click()
        time.sleep(np.random.uniform(5, 10, 1))
        save_to_s3(browser.page_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'crow wing'
        # NOTE: Looks like there's a site problem resulting in a 404. See the FAQ:
        # https://www.crowwing.us/Faq.aspx?QID=297
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        OLD URL: https://www.elbertcountysheriff.com/detention/inmateinfo/
        UPDATED URL: https://www.inmateinfo.net/inmateinfo.php?org=ecso
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'colorado'
        assert roster_row['County'].lower() == 'elbert'
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'faribault'
        """SITE USES IFRAME
        OLD URL: http://www.frcsd.org/index.php?option=com_wrapper&view=wrapper&Itemid=7
        NEW URL: http://www.bevcommasp.com/fcjail/custodylistFar.rpt.html
        """
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'isanti'
        """SITE USES IFRAME
        ORIGINAL: 'https://www.co.isanti.mn.us/425/In-Custody'
        SOURCE: 'https://sheriff.co.isanti.mn.us/letg/custody.html'
        """
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.omsweb_crawler(roster_row)  # Try to call a known crawler if possible
        # End core specific scraping code
        ##########

        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def save_single_page(roster_row, filetype='html'):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        browser = get_browser()  # Get a standard browser
        logger.info('using save_single_html_page for _%s, %s',
                    roster_row['County'], roster_row['State'])  # Log the chosen county
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        #Boilerplate code setting up logger, getting initial URL
        time.sleep(np.random.uniform(5, 10, 1))
        #Given the urlAddress passed to the function we will navigate to the page
        if filetype == 'html':
            browser.get(urlAddress)
            store_source = browser.page_source
        elif filetype == 'xls':
            # Currently handled the same way as 'html': load the page in the browser
            browser.get(urlAddress)
            store_source = browser.page_source
        else:
            response = requests.get(urlAddress)
            response.raise_for_status()
            store_source = response.content
        # Save result to S3. This call includes logging and file formatting.
        save_to_s3(store_source, page_index, roster_row, filetype=filetype)
        logger.info('Saved page _%s_', page_index)
        return True
    except Exception as errorMessage:
        try:
            record_error(message=str(errorMessage), roster_row=roster_row,
                         page_number_within_scrape=page_index, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        browser.close()
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
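# The county scripts in this collection repeat the same guarded error-handling
# block: try to close the browser and record the error with browser context,
# else record it without. A minimal sketch of a shared helper for that pattern
# follows. close_and_record_error is a hypothetical name, not a function from
# this codebase; the record_error keywords mirror the calls used above.
def close_and_record_error(errorMessage, roster_row, browser=None, page_index=None):
    try:
        browser.close()  # Fails (and falls through) if the browser never opened
        record_error(message=str(errorMessage), roster_row=roster_row,
                     page_number_within_scrape=page_index, browser=browser)
    except:
        # Fall back to recording the error without browser context
        record_error(message=str(errorMessage), roster_row=roster_row)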
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'natchitoches'
        #Wait
        time.sleep(np.random.uniform(5, 10, 1))
        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 1  # Set before navigation so the error handler can reference it
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'cleburne'
        req = requests.get(urlAddress)
        save_to_s3(req.content, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        store_source = req.content
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser()  # Get a standard browser
        #urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        #page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        #logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        logger = get_logger(roster_row)  # Get a standard logger
        browser = get_browser()  # Get a standard browser
        logger.info('using custom crawler for _%s, %s',
                    roster_row['County'], roster_row['State'])
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        browser.get(urlAddress)
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        #Wait
        time.sleep(np.random.uniform(5, 10, 1))
        # Loop over a set of letters intended to cover every last name in the roster
        letters = ['A', 'E', 'I', 'N', 'O', 'R', 'U', 'Y']
        pages = []
        indices = []
        for letter in letters:
            subpages = []
            searchbox = browser.find_element_by_xpath(
                '//*[@id="dnn_ctr1295_ViewInmateSearch_txtLastName"]')
            searchbox.send_keys(Keys.RIGHT, Keys.BACKSPACE)  # Clear the previous letter
            searchbox.send_keys(letter, Keys.RETURN)
            #Wait
            time.sleep(np.random.uniform(5, 10, 1))
            store_source = browser.page_source
            subpages.append(store_source)
            indices.append(letter + '_' + str(len(subpages)))
            finished = False
            while not finished:
                try:
                    nextpage = browser.find_element_by_partial_link_text('Next Page')
                    nextpage.click()
                    time.sleep(np.random.uniform(10, 15, 1))
                    store_source = browser.page_source
                    if store_source not in subpages:
                        subpages.append(store_source)
                        indices.append(letter + '_' + str(len(subpages)))
                    else:
                        finished = True
                except:
                    finished = True
            for element in subpages:
                pages.append(element)
        for store_source, page_index in zip(pages, indices):
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        ## Code to save a page and log appropriately
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))
        #Extract the HTML
        store_source = browser.page_source
        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        #Finding the last page
        page = 0
        soup = BeautifulSoup(store_source, 'lxml')
        try:
            x = soup.findAll("span", {"class": "k-pager-info k-label"})
            x = list(x)
            x = x[1]
            page = str(x)
            page = page[int(page.index("of")) + 3:page.index(" items</span>")]
            page = int(page)
            page = int(page / 10) + (page % 10 > 0)
        except ValueError as errorMessage:
            page = 0
            print("Please review script for this county.")
            try:
                browser.close()
                record_error(message=str(errorMessage), roster_row=roster_row,
                             browser=browser)
            except:
                record_error(message=str(errorMessage), roster_row=roster_row)
            # Record error in S3 for a general error
            logger.error('Error: %s', errorMessage)  # Log error
            sys.exit(1)
        #Crawling through all the pages
        string = str(1)
        for i in range(2, page + 1):
            elem = browser.find_element_by_xpath(
                '//*[@id="allInmatesGrid"]/div[1]/a[3]/span')
            elem.click()
            time.sleep(np.random.uniform(5, 7, 1))
            store_source = browser.page_source
            string = str(i)
            ## Code to save a page and log appropriately
            page_index = int(string) - 1
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ####################################

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        """
        OLD URL: http://www.calcoso.org/divisions-jail-inmate-roster/
        UPDATED URL: https://www.calcoso.org/inmate-roster/
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'lamar'
        # Loop over a set of letters intended to cover every last name in the roster
        letters = ['A', 'E', 'I', 'N', 'O', 'R', 'U', 'Y']
        #Extract the HTML
        page_index = 1
        #Create empty list to store page sources:
        pages = []
        #Create empty list to store indices of {letter}_{pagenumber}:
        letters_pages = []
        for letter in letters:
            browser.get(urlAddress)
            #Wait, then search for last names matching the letter
            time.sleep(np.random.uniform(5, 10, 1))
            searchbox = browser.find_element_by_xpath(
                '/html/body/div/div/div/div[2]/div[2]/div[2]/form/div[2]/input')
            searchbox.send_keys(letter)
            searchbutton = browser.find_element_by_xpath(
                '/html/body/div/div/div/div[2]/div[2]/div[2]/form/div[4]/button')
            searchbutton.click()
            #Wait
            time.sleep(np.random.uniform(5, 10, 1))
            #Default variables for entry with no navigation bar:
            split_pages = False
            page_index = 1
            finished = False
            #Check for navigation bar:
            try:
                nextpage = browser.find_element_by_link_text('Next →')
                split_pages = True
            except:
                pass
            page_name = letter + '_{}'.format(page_index)
            #Save main page
            save_to_s3(browser.page_source, page_name, roster_row)
            logger.info('Saved page _%s_', page_name)
            #Perform subroutine if multiple pages are present
            if split_pages == True:
                while finished == False:
                    try:
                        nextpage = browser.find_element_by_link_text('Next →')
                        nextpage.click()
                        page_index += 1
                        #Wait
                        time.sleep(np.random.uniform(5, 10, 1))
                        #Extract the HTML
                        page_name = letter + '_{}'.format(page_index)
                        save_to_s3(browser.page_source, page_name, roster_row)
                        logger.info('Saved page _%s_', page_name)
                        pages.append(browser.page_source)
                    except:
                        finished = True
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        ##### We will use requests for this county, not selenium #####
        r = requests.get(urlAddress)
        store_source = r.content
        time.sleep(np.random.uniform(5, 7, 1))
        # Format as BeautifulSoup
        soup = BeautifulSoup(store_source, 'lxml')
        # Pull all the hrefs
        website_link = []
        for link in soup.findAll('a'):
            data_link = link.get('href')
            website_link.append(data_link)
            print(data_link)
        # Keep only the links that point to the current arrest logs
        website_df = pd.DataFrame(website_link)
        website_df.columns = ['link']
        website_df = website_df[website_df.link.str.contains("arrestLogs/current/")]
        # Reset the index
        website_df = website_df.reset_index()
        website_df = website_df.drop(columns=['index'])
        # Pull the specific link that we parsed (the latest current arrest logs)
        r = requests.get('http://www.honolulupd.org/information/' +
                         website_df["link"].iloc[0])
        time.sleep(np.random.uniform(5, 7, 1))
        # Pull the content
        store_source = r.content
        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ####################################

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
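# A side note on the link filtering in the function above: the pandas
# round-trip only selects the first href containing "arrestLogs/current/".
# A minimal equivalent without pandas, assuming the same BeautifulSoup object,
# might look like the sketch below (latest_arrest_log_link is a hypothetical
# helper name, shown for illustration only).
def latest_arrest_log_link(soup):
    links = [a.get('href') for a in soup.findAll('a')
             if a.get('href') and 'arrestLogs/current/' in a.get('href')]
    return links[0]  # The first match is taken as the latest log, as above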
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'st. charles'
        time.sleep(np.random.uniform(5, 10, 1))
        lastpage = False
        pages = []
        names = []
        #Get first page
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        #Track the first name on each page; a repeat means the roster has ended
        soup = BeautifulSoup(store_source, 'lxml')
        firstentry = soup.find('td', {'ordered-tag': 'name'})
        names.append(firstentry.text)
        pages.append(store_source)
        while lastpage == False:
            time.sleep(np.random.uniform(5, 10, 1))
            #Navigate to next page
            try:
                nextpage = browser.find_element_by_xpath(
                    '//*[@id="primary-container"]/div/div/div/zt-collectionview/div[1]/div/div[2]/div[1]/button[2]')
                nextpage.click()
            except:
                lastpage = True
            time.sleep(np.random.uniform(5, 10, 1))
            #Extract the HTML
            store_source = browser.page_source
            page_index += 1  # Increment so successive pages don't overwrite page 0
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
            soup = BeautifulSoup(store_source, 'lxml')
            firstentry = soup.find('td', {'ordered-tag': 'name'})
            if names[-1] == firstentry.text:
                lastpage = True
            else:
                pages.append(store_source)
                names.append(firstentry.text)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))
        num_rows_found = 1000000
        rownum = 1
        store_source = browser.page_source
        row_sources = []
        row_texts = []
        while rownum < num_rows_found:
            # Click to show all rows. The page is buggy: if you click a detail
            # view and then click "back", only 10 rows are shown, and clicking
            # "All" again doesn't help. You have to select a different number
            # of rows first.
            elem = browser.find_element_by_xpath(
                '//*[@id="pager_center"]/table/tbody/tr/td[5]/select')
            elem.click()
            time.sleep(np.random.uniform(1, 2, 1))
            elem = browser.find_element_by_xpath(
                '//*[@id="pager_center"]/table/tbody/tr/td[5]/select/option[3]')
            elem.click()
            time.sleep(np.random.uniform(1, 2, 1))
            elem = browser.find_element_by_xpath(
                '//*[@id="pager_center"]/table/tbody/tr/td[5]/select')
            elem.click()
            time.sleep(np.random.uniform(1, 2, 1))
            elem = browser.find_element_by_xpath(
                '//*[@id="pager_center"]/table/tbody/tr/td[5]/select/option[5]')
            elem.click()
            time.sleep(np.random.uniform(1, 2, 1))
            elem = browser.find_element_by_xpath('//*[@id="refresh_tblII"]/div/span')
            elem.click()
            time.sleep(np.random.uniform(1, 2, 1))
            #Extract the roster rows
            rows = browser.find_elements_by_xpath(
                '/html/body/form/table/tbody/tr[2]/td/table/tbody/tr/td[2]/div/div[2]/div[2]/div/div[3]/div[3]/div/table/tbody/tr')
            logger.info('found %s rows on parse', len(rows))
            if num_rows_found == 1000000:
                num_rows_found = len(rows)
                logger.info('Found _%s_ total records', num_rows_found)
            row_texts.append(rows[rownum].text)
            rows[rownum].click()
            time.sleep(np.random.uniform(1, 5, 1))
            row_sources.append(browser.page_source)
            logger.info('Logged id page _%s_', len(row_sources))
            browser.execute_script("window.history.go(-1)")
            time.sleep(np.random.uniform(1, 5, 1))
            rownum += 1
        save_to_s3(store_source, "MAINPAGE", roster_row)
        logger.info('Saved page _%s_', page_index)
        save_pages_array(row_sources, roster_row)
        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ####################################

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'lake'
        pages = []
        #Wait for the roster to load
        time.sleep(np.random.uniform(20, 30, 1))
        #Extract the HTML once the first roster entry is present
        is_name = browser.find_element_by_xpath(
            '//*[@id="MainContent_gvIncustody_Label3_0"]')
        store_source = browser.page_source
        pages.append(store_source)
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        page_index = 2
        finished = False
        while not finished:
            try:
                nextpage = browser.find_element_by_link_text(str(page_index))
                nextpage.click()
                time.sleep(np.random.uniform(5, 10, 1))
                store_source = browser.page_source
                if store_source not in pages:
                    pages.append(store_source)
                    save_to_s3(store_source, page_index, roster_row)
                    logger.info('Saved page _%s_', page_index)
                else:
                    finished = True
                page_index += 1
            except:
                finished = True
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        IFRAME SITE:
        OLD URL: https://www.craigheadso.org/roster_custom.php
        NEW URL: https://www.myr2m.com/craigheadroster/
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'craighead'
        #Navigate to the page and wait for it to load
        browser.get(urlAddress)
        time.sleep(np.random.uniform(20, 30, 1))
        pages = []
        #Show 30 entries per page
        page_size = browser.find_element_by_xpath(
            '//*[@id="ContentPlaceHolder1_ddlPageSize"]')
        page_size.send_keys('30', Keys.RETURN)
        time.sleep(np.random.uniform(20, 30, 1))
        #Extract the HTML
        store_source = browser.page_source
        pages.append(store_source)
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        page_index += 1
        finished = False
        while not finished:
            next_page = browser.find_element_by_xpath(
                '//*[@id="ContentPlaceHolder1_cmdNext2"]')
            try:
                next_page.click()
            except:
                pass
            time.sleep(np.random.uniform(20, 30, 1))
            store_source = browser.page_source
            if store_source not in pages:
                pages.append(store_source)
                save_to_s3(store_source, page_index, roster_row)
                logger.info('Saved page _%s_', page_index)
                page_index += 1
            else:
                finished = True
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        """
        OLD URL: https://www.inmateaid.com/inmate-profile-search
        UPDATED URL: https://chargesandbonds.arapahoegov.com/
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'colorado'
        assert roster_row['County'].lower() == 'arapahoe'
        pages = []
        letters_pages = []  # Indices of saved {letter}_{pagenumber} names
        letters = [
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
            'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
        ]
        for letter in letters:
            #Search for last names starting with the selected letter
            lastname = browser.find_element_by_xpath(
                '//*[@id="ContentPlaceHolder1_txtLast"]')
            lastname.send_keys(letter)
            time.sleep(np.random.uniform(2, 5, 1))
            search = browser.find_element_by_xpath(
                '//*[@id="ContentPlaceHolder1_btnSearchEn"]')
            search.click()
            #Store first page per letter
            store_source = browser.page_source
            page_index = 1
            pagename = letter + '_' + str(page_index)
            pages.append(store_source)
            save_to_s3(store_source, pagename, roster_row)
            logger.info('Saved page _%s_', pagename)
            finished = False
            #Iterate over the second through last pages.
            #A stored page is added to the collection if not already in it;
            #otherwise, the next letter is searched.
            while not finished:
                page_index += 1
                try:
                    nextpage = browser.find_element_by_xpath(
                        '//*[@id="ContentPlaceHolder1_btnNext"]')
                    nextpage.click()
                    time.sleep(np.random.uniform(5, 10, 1))
                    store_source = browser.page_source
                    if store_source not in pages:
                        pages.append(store_source)
                        pagename = letter + '_' + str(page_index)
                        save_to_s3(store_source, pagename, roster_row)
                        logger.info('Saved page _%s_', pagename)
                        letters_pages.append(pagename)
                    else:
                        finished = True
                except:
                    finished = True
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)  # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))
        #Assume there is a second page
        more_results = True
        #While a second page exists, click the "load more" button
        while more_results == True:
            try:
                browser.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                load_more = browser.find_element_by_xpath(
                    '//*[@id="LoadMoreButton"]/p[1]')
                load_more.click()
                time.sleep(np.random.uniform(5, 10, 1))
            except:
                more_results = False
        finished = False
        while not finished:
            try:
                expandable = browser.find_element_by_xpath(
                    "//td[contains(text(),'[+]')]")
            except:
                finished = True
            time.sleep(np.random.uniform(0.5, 1, 1))
            try:
                expandable.click()
            except:
                pass
            time.sleep(np.random.uniform(0.5, 1, 1))
        # Extract the HTML
        store_source = browser.page_source
        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ####################################

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))
        #Click I Agree
        try:
            elem = browser.find_element_by_xpath('//*[@id="submit2"]')
            elem.click()
            time.sleep(np.random.uniform(2, 4, 1))
        except NoSuchElementException:
            time.sleep(np.random.uniform(2, 4, 1))
        #Extract the HTML
        store_source = browser.page_source
        #Select institution (CLAYTON COUNTY JAIL)
        soup = BeautifulSoup(store_source, 'lxml')
        nameList = soup.findAll("select", {"name": "vCurrentInstitution"})
        for i in nameList:
            nameList = str(i)
            nameList = re.sub('</option>', "", nameList)
            nameList = re.sub('</select>', "", nameList)
            nameList = re.sub('"', "", nameList)
            nameList = re.sub('>', " ", nameList)
            nameList = nameList.split("<option value=")
        regex = re.compile("^CLAYTON COUNTY JAIL")
        list_index = [
            i for i, item in enumerate(nameList) if re.search(regex, item)
        ]
        list_index = int(list_index[0])
        #Click institution
        elem = browser.find_element_by_xpath('//*[@id="vCurrentInstitution"]')
        elem.click()
        time.sleep(np.random.uniform(2, 4, 1))
        #Click the name of the jail
        elem = browser.find_element_by_xpath(
            '//*[@id="vCurrentInstitution"]/option[' + str(list_index + 1) + ']')
        elem.click()
        time.sleep(np.random.uniform(2, 4, 1))
        #Click Submit Form
        elem = browser.find_element_by_xpath('//*[@id="NextButton2"]')
        elem.click()
        time.sleep(np.random.uniform(4, 6, 1))
        # Extract the HTML
        store_source = browser.page_source
        ## Code to save the first page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        #Finding the last page
        soup = BeautifulSoup(store_source, 'lxml')
        page = 0
        for link in soup.findAll("span", {"class": "oq-nav-btwn"}):
            page = str(link.text)
            page = re.sub("Page 1 of ", "", page)
            page = int(page)
        #Crawling through all the pages
        string = str(1)
        try:
            for i in range(2, page + 1):
                elem = browser.find_element_by_xpath('//*[@id="oq-nav-nxt"]')
                elem.click()
                time.sleep(np.random.uniform(5, 7, 1))
                store_source = browser.page_source
                string = str(i)
                ## Code to save a page and log appropriately
                page_index = int(string) - 1
                save_to_s3(store_source, page_index, roster_row)
                logger.info('Saved page _%s_', page_index)
        except NoSuchElementException as errorMessage:
            print("Please review this crawler")
            time.sleep(np.random.uniform(2, 4, 1))
            try:
                browser.close()
                record_error(message=str(errorMessage), roster_row=roster_row,
                             browser=browser)
            except:
                record_error(message=str(errorMessage), roster_row=roster_row)
            # Record error in S3 for a general error
            logger.error('Error: %s', errorMessage)  # Log error
            sys.exit(1)
        # End core specific scraping code
        ####################################

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))
        #Click to select all the display options
        elem = browser.find_element_by_xpath(
            '//*[@id="pager_center"]/table/tbody/tr/td[5]/select')
        elem.click()
        time.sleep(np.random.uniform(2, 4, 1))
        #Click to display all
        elem = browser.find_element_by_xpath(
            '//*[@id="pager_center"]/table/tbody/tr/td[5]/select/option[5]')
        elem.click()
        time.sleep(np.random.uniform(4, 7, 1))
        # Extract the HTML
        store_source = browser.page_source
        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ####################################

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))
        #Extract the HTML
        store_source = browser.page_source
        ## Code to save the first page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        #Finding the last page
        soup = BeautifulSoup(store_source, 'lxml')
        page = 0
        for link in soup.findAll("div", {"class": "loca-search-head text-center"}):
            page = str(link.text)
            page = re.sub(' Results for "_"', "", page)
            page = int(page) / 10
            page = math.ceil(page)
        #Crawling through all the pages
        string = str(1)
        for i in range(2, page + 1):
            if i > 30:
                print("Exceeds 300 inmates")
            elif i == 2:
                elem = browser.find_element_by_xpath(
                    '/html/body/div/div/div/div[2]/div[3]/div[12]/ul/li[3]/a')
                elem.click()
                time.sleep(np.random.uniform(3, 5, 1))
                store_source = browser.page_source
                string = str(i)
                ## Code to save the page and log appropriately
                page_index = int(string) - 1
                save_to_s3(store_source, page_index, roster_row)
                logger.info('Saved page _%s_', page_index)
            elif i == 3:
                elem = browser.find_element_by_xpath(
                    '/html/body/div/div/div/div[2]/div[3]/div[12]/ul/li[4]/a')
                elem.click()
                time.sleep(np.random.uniform(3, 5, 1))
                store_source = browser.page_source
                string = str(i)
                ## Code to save the page and log appropriately
                page_index = int(string) - 1
                save_to_s3(store_source, page_index, roster_row)
                logger.info('Saved page _%s_', page_index)
            elif i == 4:
                elem = browser.find_element_by_xpath(
                    '/html/body/div/div/div/div[2]/div[3]/div[12]/ul/li[5]/a')
                elem.click()
                time.sleep(np.random.uniform(3, 5, 1))
                store_source = browser.page_source
                string = str(i)
                ## Code to save the page and log appropriately
                page_index = int(string) - 1
                save_to_s3(store_source, page_index, roster_row)
                logger.info('Saved page _%s_', page_index)
            elif i >= 5:
                elem = browser.find_element_by_xpath(
                    '/html/body/div/div/div/div[2]/div[3]/div[12]/ul/li[6]/a')
                elem.click()
                time.sleep(np.random.uniform(3, 5, 1))
                store_source = browser.page_source
                string = str(i)
                ## Code to save the page and log appropriately
                page_index = int(string) - 1
                save_to_s3(store_source, page_index, roster_row)
                logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ####################################

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))
        #Extract the HTML
        store_source = browser.page_source
        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        #Finding the last page
        soup = BeautifulSoup(store_source, 'lxml')
        page = 0
        for link in soup.findAll("div", {"id": "ext-comp-1010"}):
            page = str(link.text)
            page = re.sub("of ", "", page)
            page = int(page)
        #Crawling through all the pages
        string = str(1)
        for i in range(2, page + 1):
            elem = browser.find_element_by_xpath('//*[@id="ext-gen110"]')
            elem.click()
            time.sleep(np.random.uniform(7, 10, 1))
            store_source = browser.page_source
            string = str(i)
            ## Code to save a page and log appropriately
            page_index = int(string) - 1
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ####################################

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))
        #Extract the HTML
        store_source = browser.page_source
        ## Code to save the first page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # Finding the last page
        soup = BeautifulSoup(store_source, 'lxml')
        page = 0
        for link in soup.findAll("td", {"style": "white-space:nowrap;"}):
            page = str(link.text)
            page = re.sub("Page 1 of ", "", page)
            page = page[:page.index(" ")]
            page = int(page)
        #Crawling through all the pages
        string = str(1)
        if page == 0:
            print("No inmate")
        if page <= 10:
            for i in range(2, page + 1):
                try:
                    elem = browser.find_element_by_xpath(
                        '//*[@id="Content_MainContent_ASPxGridView3_DXPagerTop"]/tbody/tr/td/table/tbody/tr/td['
                        + str((i * 2) + 3) + ']')
                    elem.click()
                    time.sleep(np.random.uniform(5, 7, 1))
                    store_source = browser.page_source
                    string = str(i)
                    ## Code to save a page and log appropriately
                    page_index = int(string) - 1
                    save_to_s3(store_source, page_index, roster_row)
                    logger.info('Saved page _%s_', page_index)
                # If error then webcrawl needs update
                except NoSuchElementException as errorMessage:
                    print("Please review script for this county.")
                    try:
                        browser.close()
                        record_error(message=str(errorMessage),
                                     roster_row=roster_row, browser=browser)
                    except:
                        record_error(message=str(errorMessage),
                                     roster_row=roster_row)
                    # Record error in S3 for a general error
                    logger.error('Error: %s', errorMessage)  # Log error
                    sys.exit(1)
        elif page > 10:
            for i in range(2, page + 1):
                try:
                    elem = browser.find_element_by_xpath(
                        '//*[@id="Content_MainContent_ASPxGridView3_DXPagerTop"]/tbody/tr/td/table/tbody/tr/td[27]/img')
                    elem.click()
                    time.sleep(np.random.uniform(5, 7, 1))
                    store_source = browser.page_source
                    string = str(i)
                    ## Code to save a page and log appropriately
                    page_index = int(string) - 1
                    save_to_s3(store_source, page_index, roster_row)
                    logger.info('Saved page _%s_', page_index)
                # If error -> sometimes the image is not td[27] but td[29]
                except NoSuchElementException:
                    elem = browser.find_element_by_xpath(
                        '//*[@id="Content_MainContent_ASPxGridView3_DXPagerTop"]/tbody/tr/td/table/tbody/tr/td[29]/img')
                    elem.click()
                    time.sleep(np.random.uniform(5, 7, 1))
                    store_source = browser.page_source
                    string = str(i)
                    ## Code to save a page and log appropriately
                    page_index = int(string) - 1
                    save_to_s3(store_source, page_index, roster_row)
                    logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ####################################

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)