def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
        """
        OLD URL: http://hcsosearch.hendersoncountync.org/hcso/jail_Search.asp
        NEW URL: http://hcsosearch.hendersoncountync.org/hcso/jail_Results.asp?autocomplete1=&dmxCalendar_1=&dmxCalendar_2=&S_Age=&Search.y=10&S_PrimaryOffense=&Search.x=61
        """
        #urlAddress = roster['Working Link'].values[index]
        suffix = '&offset={}'
        pages = []
        req = requests.get(urlAddress)
        store_source = req.content

        #Extract number of inmates:
        soup = BeautifulSoup(store_source, 'lxml')
        inmate_count = soup.find('div', {'class': 'WADAResultsCount'}).text.split(
            'Records\r\n \r\n 1 to\r\n \r\n 10 of\r\n \r\n ')
        inmate_count = int(inmate_count[1].strip())

        #Convert number of inmates to number of pages (10 records per page):
        page_count = math.ceil(inmate_count / 10)

        #Substitute page number times ten into the offset suffix
        for i in range(0, page_count):
            #Wait
            time.sleep(np.random.uniform(5, 10))
            #Store html
            req = requests.get(urlAddress + suffix.format(i * 10))
            store_source = req.content
            pages.append(store_source)

        ## Code to save a page and log appropriately
        save_pages_array(pages, roster_row)
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        logger.info('complete!')
    except Exception as errorMessage:
        #No browser is opened in this requests-based scraper, so record
        #the error without one.
        record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
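# NOTE: hypothetical helper, a minimal sketch and not part of the original
# scraper. The split above depends on the exact whitespace inside the
# WADAResultsCount div; matching the trailing "of N" with a regex is less
# brittle, assuming the div text ends with the total record count.
import re

def parse_inmate_count(results_count_text):
    """Return the trailing record total from text like '... 1 to 10 of 243'."""
    match = re.search(r'of\s+(\d+)\s*$', results_count_text.strip())
    if match is None:
        raise ValueError('Could not find a record count in: %r' % results_count_text)
    return int(match.group(1))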
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))

        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10))

        num_rows_found = 1000000
        rownum = 1
        store_source = browser.page_source
        row_sources = []
        row_texts = []
        while rownum < num_rows_found:
            # Click to show all rows. The page is buggy: if you open a
            # detail view and click "back", only 10 rows show, and
            # clicking "All" again doesn't help. You have to select a
            # different number of rows first.
            elem = browser.find_element_by_xpath('//*[@id="pager_center"]/table/tbody/tr/td[5]/select')
            elem.click()
            time.sleep(np.random.uniform(1, 2))
            elem = browser.find_element_by_xpath('//*[@id="pager_center"]/table/tbody/tr/td[5]/select/option[3]')
            elem.click()
            time.sleep(np.random.uniform(1, 2))
            elem = browser.find_element_by_xpath('//*[@id="pager_center"]/table/tbody/tr/td[5]/select')
            elem.click()
            time.sleep(np.random.uniform(1, 2))
            elem = browser.find_element_by_xpath('//*[@id="pager_center"]/table/tbody/tr/td[5]/select/option[5]')
            elem.click()
            time.sleep(np.random.uniform(1, 2))
            elem = browser.find_element_by_xpath('//*[@id="refresh_tblII"]/div/span')
            elem.click()
            time.sleep(np.random.uniform(1, 2))
            #initial_rows = browser.find_elements_by_xpath('/html/body/form/table/tbody/tr[2]/td/table/tbody/tr/td[2]/div/div[2]/div[2]/div/div[3]/div[3]/div/table/tbody/tr')

            #Extract the HTML#
            rows = browser.find_elements_by_xpath('/html/body/form/table/tbody/tr[2]/td/table/tbody/tr/td[2]/div/div[2]/div[2]/div/div[3]/div[3]/div/table/tbody/tr')
            logger.info('found %s rows on parse', len(rows))
            if num_rows_found == 1000000:
                num_rows_found = len(rows)
                logger.info('Found _%s_ total records', num_rows_found)
            row_texts.append(rows[rownum].text)
            rows[rownum].click()
            time.sleep(np.random.uniform(1, 5))
            row_sources.append(browser.page_source)
            logger.info('Logged id page _%s_', len(row_sources))
            browser.execute_script("window.history.go(-1)")
            time.sleep(np.random.uniform(1, 5))
            rownum += 1

        ## Code to save the pages and log appropriately
        save_to_s3(store_source, "MAINPAGE", roster_row)
        logger.info('Saved main page')
        save_pages_array(row_sources, roster_row)
        # End core specific scraping code
        ####################################

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))

        browser.get(urlAddress)
        #Use elements like below to find xpath keys and click through

        #Navigate to jail log
        time.sleep(np.random.uniform(5, 10))
        logger.info('Clicking on jail log')
        jaillog = browser.find_element_by_xpath('//*[@id="AutoNumber2"]/tbody/tr[7]/td[2]/p/font/a[1]')
        jaillog.click()

        #Agree to disclaimer; click "Continue"
        time.sleep(np.random.uniform(3, 8))
        logger.info('Clicking continue')
        agree = browser.find_element_by_xpath('/html/body/form/div/center/table/tbody/tr/td/center/table/tbody/tr[2]/td/a')
        agree.click()

        #View all inmates:
        time.sleep(np.random.uniform(3, 8))
        logger.info('Clicking view all')
        viewall = browser.find_element_by_xpath('/html/body/div/center/table/tbody/tr/td/form/table/tbody/tr[3]/td/table/tbody/tr[2]/td[2]/input')
        viewall.click()

        #Create list for stored page sources:
        pages = []
        store_source = browser.page_source
        pages.append(store_source)
        logger.info('stored page _%s_', len(pages))
        time.sleep(np.random.uniform(3, 8))

        #Site displays 16 records per page.
        #Click Next until the page has already been stored.
        finished = False
        while not finished:
            nextpage = browser.find_element_by_xpath('/html/body/div/center/table/tbody/tr/td/form/table/tbody/tr[4]/td/table/tbody/tr[1]/td[2]/table/tbody/tr[2]/td/table/tbody/tr[2]/td[2]/table/tbody/tr[2]/td/a[5]/img')
            nextpage.click()
            time.sleep(np.random.uniform(3, 8))
            #Select top button so identical pages won't register as different
            topbutton = browser.find_element_by_xpath('//*[@id="BROWSE_1$1"]')
            topbutton.click()
            store_source = browser.page_source
            if store_source in pages:
                finished = True
            else:
                pages.append(store_source)
                logger.info('stored page _%s_', len(pages))

        ## Code to save a page and log appropriately
        save_pages_array(pages, roster_row)
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))

        pages = []

        #Given the urlAddress passed to the function, navigate to the page
        browser.get(urlAddress)
        #Use elements like below to find xpath keys and click through

        #Wait
        time.sleep(np.random.uniform(5, 10))

        #The site opens with a popup:
        #  "Session expired OR System Exit Called
        #   Navigate to Login Screen?  Cancel/OK"
        #Click "OK" to dismiss it.
        bypass = browser.switch_to.alert
        bypass.accept()

        #Wait
        time.sleep(np.random.uniform(1, 2))

        #Navigate to roster
        roster = browser.find_element_by_xpath('//*[@id="InmateInfoMenuHeader"]')
        roster.click()

        #Extract the HTML
        store_source = browser.page_source
        pages.append(store_source)

        finished = False
        while not finished:
            try:
                next_page = browser.find_element_by_xpath('//*[@id="nextPageButton"]')
                next_page.click()
            except:
                finished = True
            #Wait
            time.sleep(np.random.uniform(5, 10))
            store_source = browser.page_source
            #Check whether the current page has already been stored; if not, store it
            if store_source in pages:
                finished = True
            else:
                pages.append(store_source)

        ## Code to save a page and log appropriately
        save_pages_array(pages, roster_row)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
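# NOTE: hypothetical shared helper, a minimal sketch and not part of the
# original scrapers. Several of the TDT_Viewer-style functions above and
# below inline the same "click next until the page source repeats" loop;
# factoring it out, assuming a clickable next button located by
# `next_xpath`, could look like this:
def collect_pages_until_repeat(browser, next_xpath, min_wait=5, max_wait=10):
    """Click the element at next_xpath until the page source repeats;
    return every distinct page source seen (hypothetical helper)."""
    pages = [browser.page_source] #store the page we start on
    while True:
        try:
            browser.find_element_by_xpath(next_xpath).click()
        except Exception:
            break #no next button, so this was the last page
        time.sleep(np.random.uniform(min_wait, max_wait))
        store_source = browser.page_source
        if store_source in pages:
            break #the page repeated, so every page has been seen
        pages.append(store_source)
    return pages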
def main(roster_row):
    try:
        """
        IFRAME SITE
        OLD URL:     http://www.co.tillamook.or.us/gov/Jail/Division-inmateList.html
        UPDATED URL: https://www.co.tillamook.or.us/gov/jail/inmatelist/inmate.htm
        """
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))

        browser.get(urlAddress)
        #Use elements like below to find xpath keys and click through
        pages = []
        time.sleep(np.random.uniform(5, 10))

        #Show 100 records per page
        dropdown = browser.find_element_by_xpath('//*[@id="inmateList_length"]/label/select')
        dropdown.send_keys('100')
        logger.info('selecting 100 per page')

        #Extract the HTML
        store_source = browser.page_source
        pages.append(store_source)
        logger.info('Storing page _%s_', len(pages))

        finished = False
        while not finished:
            time.sleep(np.random.uniform(5, 10))
            nextpage = browser.find_element_by_xpath('//*[@id="inmateList_next"]')
            nextpage.click()
            logger.info('clicking next page...')
            store_source = browser.page_source
            if store_source in pages:
                finished = True
            else:
                pages.append(store_source)
                logger.info('Storing page _%s_', len(pages))

        ## Code to save a page and log appropriately
        save_pages_array(pages, roster_row)
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger
        """
        Looks like the page redirects you to:
        http://24.142.144.46:8080/TDT_Viewer/actions/FilterInmateInformation.do
        with a different setup
        """
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))

        pages = []

        #Navigate to the page
        browser.get(urlAddress)

        #Wait
        time.sleep(np.random.uniform(5, 10))

        #Open roster in new tab
        roster_link = browser.find_element_by_xpath('//*[@id="content"]/div[2]/p/strong/a')
        roster_link.click()

        #Wait
        time.sleep(np.random.uniform(3, 6))

        #Close current tab, switch to newly opened tab
        browser.close()
        browser.switch_to.window(browser.window_handles[-1])

        inmate_info = browser.find_element_by_xpath('//*[@id="InmateInfoMenuHeader"]')
        inmate_info.click()

        #Wait
        time.sleep(np.random.uniform(3, 6))

        #Extract the HTML
        store_source = browser.page_source
        pages.append(store_source)

        finished = False
        while not finished:
            try:
                next_page = browser.find_element_by_xpath('//*[@id="nextPageButton"]')
                next_page.click()
            except:
                finished = True
            #Wait
            time.sleep(np.random.uniform(5, 10))
            store_source = browser.page_source
            #Check whether the current page has already been stored; if not, store it
            if store_source in pages:
                finished = True
            else:
                pages.append(store_source)

        ## Code to save a page and log appropriately
        save_pages_array(pages, roster_row)
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
def main(roster_row):
    try:
        """
        https://bcsheriff.dyndns.org:8443/TDT_Viewer/

        The site presents a certificate warning:
        "Your connection is not private
        Attackers might be trying to steal your information from
        bcsheriff.dyndns.org (for example, passwords, messages, or
        credit cards). Learn more
        NET::ERR_CERT_INVALID"
        """
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))

        browser.get(urlAddress)
        #Use elements like below to find xpath keys and click through
        pages = []

        #Mouse over menu for link to roster:
        time.sleep(np.random.uniform(5, 10))
        menu = browser.find_element_by_xpath('//*[@id="art-main"]/nav/ul/li[6]/a')
        action = ActionChains(browser)
        action.move_to_element(menu).perform()
        roster_link = browser.find_element_by_xpath('//*[@id="art-main"]/nav/ul/li[6]/ul/li/a')
        roster_link.click()

        #Wait
        time.sleep(np.random.uniform(5, 10))
        inmate_info = browser.find_element_by_xpath('//*[@id="InmateInfoMenuHeader"]')
        inmate_info.click()

        #Wait
        time.sleep(np.random.uniform(5, 10))

        #Extract the HTML
        store_source = browser.page_source
        pages.append(store_source)

        finished = False
        while not finished:
            try:
                next_page = browser.find_element_by_xpath('//*[@id="nextPageButton"]')
                next_page.click()
                logger.info('clicking next page...')
            except:
                finished = True
            #Wait
            time.sleep(np.random.uniform(5, 10))
            store_source = browser.page_source
            #Check whether the current page has already been stored; if not, store it
            if store_source in pages:
                finished = True
            else:
                pages.append(store_source)
                logger.info('appending page...')

        save_pages_array(pages, roster_row)
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))

        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10))
        #Use elements like below to find xpath keys and click through

        #Clicking show all
        elem = browser.find_element_by_xpath('//*[@id="ddDisplayRows"]')
        elem.click()
        time.sleep(np.random.uniform(1, 2))
        elem = browser.find_element_by_xpath('//*[@id="ddDisplayRows"]/option[5]')
        elem.click()
        time.sleep(np.random.uniform(3, 5))

        #Extract the HTML#
        store_source = browser.page_source

        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)

        num_rows_found = 1000000
        rownum = 2 # start on row 2; row 1 matches this same xpath but is the header
        store_source = browser.page_source
        row_sources = []
        while rownum < num_rows_found:
            rows = browser.find_elements_by_xpath('/html/body/form/div[4]/table/tbody/tr/td[1]/a')
            num_rows_found = len(rows)
            logger.info('Found _%s_ total records', num_rows_found)
            rows[rownum].click()
            time.sleep(np.random.uniform(1, 5))
            row_sources.append(browser.page_source)
            logger.info('Logged id page _%s_', len(row_sources))
            browser.execute_script("window.history.go(-1)")
            time.sleep(np.random.uniform(1, 5))
            rownum += 1

        save_to_s3(store_source, "MAINPAGE", roster_row)
        logger.info('Saved main page')
        save_pages_array(row_sources, roster_row)
        ####################################

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
def jailinmates_aspx(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        logger.info('Choosing jailinmates_aspx crawler with url _%s_', urlAddress)
        if 'jailinmates' not in urlAddress:
            raise Exception("Appears that this site _%s_ is not a jailinmates URL" % urlAddress)
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages

        ##########
        # Begin core specific scraping code
        browser.get(urlAddress)
        pages = []
        names = []

        #Wait
        time.sleep(np.random.uniform(5, 10))

        #Extract the HTML
        store_source = browser.page_source
        soup = BeautifulSoup(store_source, 'lxml')
        table = soup.find('table', {'class': 'p2c-datagrid'})
        cells = table.find_all('td')
        names.append(cells[5].text)
        pages.append(store_source)
        logger.info('added page _%s_', names[0])

        finished = False
        while not finished:
            try:
                try:
                    #The pager links directly to the next few page numbers
                    nextpage = browser.find_element_by_link_text(str(len(pages) + 1))
                    nextpage.click()
                    time.sleep(np.random.uniform(5, 10))
                    store_source = browser.page_source
                    soup = BeautifulSoup(store_source, 'lxml')
                    table = soup.find('table', {'class': 'p2c-datagrid'})
                    cells = table.find_all('td')
                    name = cells[5].text
                    #Use the first inmate name on the page to detect repeats
                    if name not in names:
                        pages.append(store_source)
                        names.append(name)
                        logger.info('added page _%s_', name)
                    else:
                        finished = True
                except:
                    time.sleep(np.random.uniform(5, 10))
                    try:
                        #Fall back to the trailing '...' link for further pages
                        nextpage = browser.find_elements_by_link_text('...')[-1]
                        nextpage.click()
                        store_source = browser.page_source
                        soup = BeautifulSoup(store_source, 'lxml')
                        table = soup.find('table', {'class': 'p2c-datagrid'})
                        cells = table.find_all('td')
                        name = cells[5].text
                        if name not in names:
                            pages.append(store_source)
                            names.append(name)
                            logger.info('added page _%s_', name)
                        else:
                            finished = True
                    except:
                        finished = True
            except:
                finished = True

        ## Code to save a page and log appropriately
        save_pages_array(pages, roster_row)
    except Exception as errorMessage:
        try:
            record_error(message=str(errorMessage), roster_row=roster_row,
                         page_number_within_scrape=page_index, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        browser.close()
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
def main(roster_row):
    try:
        """
        OLD LINK:    https://sandusky-county.com/index.php?page=sheriff
        DIRECT LINK: https://198.101.49.158:8443/TDT_Viewer/actions/FilterInmateInformation.do
        """
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))

        browser.get(urlAddress)

        #Create list to store page sources
        pages = []

        #Wait
        time.sleep(np.random.uniform(5, 10))

        #Some TDT_Viewer sites open with a popup:
        #  "Session expired OR System Exit Called
        #   Navigate to Login Screen?  Cancel/OK"
        #This one does not seem to, so the "OK" bypass stays disabled.
        #bypass = browser.switch_to.alert
        #bypass.accept()
        #logger.info('click bypass link')

        #Wait
        time.sleep(np.random.uniform(1, 2))

        #Navigate to roster
        roster = browser.find_element_by_xpath('//*[@id="InmateInfoMenuHeader"]')
        roster.click()
        logger.info('click roster link')

        #Wait
        time.sleep(np.random.uniform(1, 2))

        #Click "search"
        search_button = browser.find_element_by_xpath('//*[@id="save"]')
        search_button.click()
        logger.info('click search')

        #Wait
        time.sleep(np.random.uniform(3, 6))

        #Extract the HTML
        store_source = browser.page_source
        pages.append(store_source)
        logger.info('visited first page')

        finished = False
        while not finished:
            try:
                next_page = browser.find_element_by_xpath('//*[@id="nextPageButton"]')
                next_page.click()
            except:
                finished = True
            #Wait
            time.sleep(np.random.uniform(5, 10))
            store_source = browser.page_source
            #Check whether the current page has already been stored; if not, store it
            if store_source in pages:
                finished = True
            else:
                pages.append(store_source)
                logger.info('saved page _%s_', len(pages))

        ## Code to save a page and log appropriately
        save_pages_array(pages, roster_row)
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))

        pages = []
        suffixes = []

        #Extract the HTML
        req = requests.get(urlAddress)
        store_source = req.content
        pages.append(store_source)
        soup = BeautifulSoup(store_source, 'lxml')
        nextpage = soup.find_all('a', {'class': 'next page'})

        finished = False
        try:
            new_suffix = nextpage[0]['href']
        except:
            finished = True

        while not finished and new_suffix not in suffixes:
            newurl = urlAddress + new_suffix
            logger.info('Found _%s_', newurl)
            req = requests.get(newurl)
            suffixes.append(new_suffix)
            store_source = req.content
            pages.append(store_source)
            soup = BeautifulSoup(store_source, 'lxml')
            nextpage = soup.find_all('a', {'class': 'next page'})
            try:
                new_suffix = nextpage[0]['href']
            except:
                finished = True
            #Wait
            time.sleep(np.random.uniform(5, 10))

        #Drop the final page if it wrapped back around to the first
        if len(pages) > 1 and pages[0] == pages[-1]:
            pages.pop()

        ## Code to save a page and log appropriately
        save_pages_array(pages, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
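# NOTE: a minimal sketch, not part of the original scraper. The loop above
# builds each next-page URL by concatenation (urlAddress + new_suffix),
# which only works while the 'next page' href is a bare suffix. If the
# href can be a relative or absolute path, urllib.parse.urljoin resolves
# it against the base URL correctly:
#
#   from urllib.parse import urljoin
#   newurl = urljoin(urlAddress, new_suffix) #handles relative and absolute hrefs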
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))

        letters = ['A', 'E', 'I', 'N', 'O', 'R', 'U', 'Y']
        browser.get(urlAddress)

        num_rows_found = 1000000
        rownum = 1
        store_source = browser.page_source
        row_sources = []
        while rownum < num_rows_found:
            rows = browser.find_elements_by_xpath('/html/body/div/div[2]/div/div[2]/div[2]/div/div/div[2]/div/div/table/tbody/tr')
            num_rows_found = len(rows)
            logger.info('Found %s profiles', len(rows))
            try:
                link = browser.find_element_by_xpath('/html/body/div/div[2]/div/div[2]/div[2]/div/div/div[2]/div/div/table/tbody/tr[%s]/td[1]/a' % rownum)
            except Exception as e:
                #If every row has already been captured, we are done;
                #otherwise something unexpected happened
                if len(row_sources) == num_rows_found:
                    break
                else:
                    raise e
            link.click()
            time.sleep(np.random.uniform(1, 5))
            row_sources.append(browser.page_source)
            logger.info('Logged id page _%s_', len(row_sources))
            browser.execute_script("window.history.go(-1)")
            time.sleep(np.random.uniform(1, 5))
            rownum += 1

        ## Code to save the pages and log appropriately
        save_to_s3(store_source, "MAINPAGE", roster_row)
        logger.info('Saved main page')
        save_pages_array(row_sources, roster_row)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))

        browser.get(urlAddress)

        #Create list to store page sources
        pages = []

        #Wait
        time.sleep(np.random.uniform(5, 10))

        view_current = browser.find_element_by_xpath('//*[@id="btnCurrent"]')
        view_current.click()

        #Wait
        time.sleep(np.random.uniform(5, 10))

        #Extract the HTML
        store_source = browser.page_source
        pages.append(store_source)
        logger.info('found page _%s_', len(pages))

        finished = False
        page_index = 1

        #Wait
        time.sleep(np.random.uniform(2, 5))
        while not finished:
            #The "go to" dropdown lists entries like "51_", "101_", ...
            dropdown = browser.find_element_by_xpath('//*[@id="selGoTo"]')
            dropdown.click()
            logger.info('clicking goto dropdown')
            dropstring = str(page_index * 50 + 1) + '_'
            logger.info('sending page number _%s_', dropstring)
            dropdown.send_keys(dropstring)

            #Wait
            time.sleep(np.random.uniform(3, 8))

            #Extract the HTML
            store_source = browser.page_source
            if len(pages) > 1 and store_source in pages:
                finished = True
                try:
                    dropdown = browser.find_element_by_xpath('//*[@id="selGoTo"]')
                    dropdown.send_keys(Keys.ESCAPE)
                except:
                    pass
            else:
                pages.append(store_source)
                logger.info('found page _%s_', len(pages))
                page_index += 1 #advance to the next block of 50 records

        ## Code to save a page and log appropriately
        save_pages_array(pages, roster_row)
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage) # Log error
        sys.exit(1)