Example #1
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser() # Get a standard browser
        #urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        #page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        #logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower(
        ) != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.save_single_page(
            roster_row)  # try to call a known crawler if possible
        ## Code to save a page and log appropriately
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #2
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'otter tail'
        """
        OLD URL: http://www.co.otter-tail.mn.us/991/In-Custody-List
        NEW URL: https://www.ottertailcounty.us/sheriff/report/custody%20list.rpt.html
        
        """
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #3
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.omsweb_crawler(roster_row)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #4
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'becker'
        time.sleep(np.random.uniform(5, 10, 1))
        expand = browser.find_element_by_xpath(
            '//*[@id="main-content"]/div[3]/a[1]')
        expand.click()

        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #5
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        ##########
        # Begin core specific scraping code

        if roster_row['State'].lower(
        ) != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))

        # Extract the HTML using basic_multipage
        crawlers.basic_multipage(
            roster_row,
            next_type="xpath",
            next_string='//*[@id="JailRosterbuttondiv"]/a[8]'
        )  # try to call a known crawler if possible

        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #6
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'morrison'
        """
        Old URL: https://www.co.morrison.mn.us/?SEC=35BA1570-F608-40A9-9571-6968DD357BF6
        New URL: https://incustody.co.morrison.mn.us/
            
        """
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #7
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'hayward'

        time.sleep(np.random.uniform(5, 10, 1))
        elem = browser.find_element_by_xpath(
            "//span[contains(text(),'I Agree')]")
        elem.click()
        time.sleep(np.random.uniform(5, 10, 1))
        save_to_s3(browser.page_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #8
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'crow wing'
        #Use elements like below to find xpath keys and click through
        # NOTE: Looks like there's a site problem with a 404. See the FAQ:
        # https://www.crowwing.us/Faq.aspx?QID=297

        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #9
def main(roster_row):
    try:
        """
        OLD URL: https://www.elbertcountysheriff.com/detention/inmateinfo/
        UPDATED URL: https://www.inmateinfo.net/inmateinfo.php?org=ecso
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'colorado'
        assert roster_row['County'].lower() == 'elbert'
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #10
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress) 

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'faribault'
        """SITE USES IFRAME
        OLD URL: http://www.frcsd.org/index.php?option=com_wrapper&view=wrapper&Itemid=7
        NEW URL: http://www.bevcommasp.com/fcjail/custodylistFar.rpt.html
        """
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
        
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #11
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'isanti'
        """SITE USES IFRAME
        ORIGINAL: 'https://www.co.isanti.mn.us/425/In-Custody'
        SOURCE: 'https://sheriff.co.isanti.mn.us/letg/custody.html'
        """
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #12
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower(
        ) != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.omsweb_crawler(
            roster_row)  # try to call a known crawler if possible

        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #13
def save_single_page(roster_row, filetype='html'):
    try:
        logger = get_logger(roster_row) # Get a standard logger
        browser = get_browser() # Get a standard browser
        logger.info('using save_single_html_page for _%s, %s', roster_row['County'], roster_row['State']) # Log the chosen URL

        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL
        #Boilerplate code setting up logger, getting initial URL
        time.sleep(np.random.uniform(5,10,1))

        #Given the urlAddress passed to the function we will navigate to the page
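        # Note: both the 'html' and 'xls' branches below fetch the page with the
        # browser; any other filetype falls back to a plain requests.get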
        if filetype=='html':
            browser.get(urlAddress) 
            store_source = browser.page_source
        elif filetype=='xls':
            browser.get(urlAddress) 
            store_source = browser.page_source
        else:
            response = requests.get(urlAddress)
            response.raise_for_status()
            store_source = response.content
        save_to_s3(store_source, page_index, roster_row, filetype=filetype) # Save result to s3. This call includes logging and file formatting
        logger.info('Saved page _%s_', page_index)
        return True
    except Exception as errorMessage:
        try:
            record_error(message=str(errorMessage), roster_row=roster_row, page_number_within_scrape=page_index, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        browser.close()
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #14
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress) 

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'natchitoches'
        #Wait
        time.sleep(np.random.uniform(5,10,1))
        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
        
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #15
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        page_index = 1

        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'cleburne'

        req = requests.get(urlAddress)
        save_to_s3(req.content, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        store_source = req.content
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #16
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser() # Get a standard browser
        #urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        #page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        #logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
        
        browser = get_browser() # Get a standard browser
        logger.info('using custom crawler for _%s, %s', roster_row['County'], roster_row['State']) # Log the chosen URL

        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        browser.get(urlAddress)
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL
        
        #Wait
        time.sleep(np.random.uniform(5,10,1))
        
        letters = ['A', 'E', 'I', 'N', 'O', 'R', 'U', 'Y'] 
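        # The letters above are presumably chosen so that nearly every last
        # name matches at least one search query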
        pages = []
        indices = []
        
        for letter in letters:
            
            subpages = []
            searchbox = browser.find_element_by_xpath('//*[@id="dnn_ctr1295_ViewInmateSearch_txtLastName"]')
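            # Move the cursor right and backspace, presumably to clear the
            # previously entered letter before typing the next one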
            searchbox.send_keys(Keys.RIGHT, Keys.BACKSPACE)
            searchbox.send_keys(letter, Keys.RETURN)
        
            
            #Wait
            time.sleep(np.random.uniform(5,10,1))
            
            store_source = browser.page_source
            subpages.append(store_source)
            indices.append(letter+'_'+str(len(subpages)))
        
            finished = False
            
            while not finished:
                try:
                    nextpage = browser.find_element_by_partial_link_text('Next Page')
                    nextpage.click()
                    time.sleep(np.random.uniform(10,15,1))
                    store_source = browser.page_source
                    if store_source not in subpages:
                        subpages.append(store_source)
                        indices.append(letter+'_'+str(len(subpages)))
                    else:
                        finished = True
                    
                except:
                    finished = True
                    
            for element in subpages:
                pages.append(element)

        for store_source, page_index in zip(pages, indices):
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)

        ## Code to save a page and log appropriately
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #17
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ####################################
        
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
       
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7,10,1))
        
        #Extract the HTML#
        store_source = browser.page_source

        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        
        #Finding the last page
        page=0
        soup = BeautifulSoup(store_source, 'lxml')
        try:
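            # Parse the grid's pager label ("... of N items") to get the total
            # item count, then ceiling-divide by 10 items per page to get the
            # number of pages to crawl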
            x=soup.findAll("span", {"class":"k-pager-info k-label"})
            x=list(x)
            x=x[1]
            page=str(x)
            page=page[int(page.index("of"))+3:page.index(" items</span>")]
            page=int(page)
            page=int(page/10)+(page % 10 > 0)
        except ValueError as errorMessage:
            page=0
            print("Please review script for this county.")
            try:
                browser.close()
                record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
            except:
                record_error(message=str(errorMessage), roster_row=roster_row)
                
            # Record error in S3 for a general error
            logger.error('Error: %s', errorMessage)
            # Log error
            sys.exit(1)

        #Crawling through all the pages
        string = str(1)
        for i in range(2,page+1):
            elem = browser.find_element_by_xpath('//*[@id="allInmatesGrid"]/div[1]/a[3]/span')
            elem.click()        
            time.sleep(np.random.uniform(5,7,1))
            store_source = browser.page_source
            string = str(i)
            ## Code to save a page and log appropriately
            page_index = int(string) - 1
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)

        # End core specific scraping code
        
        ####################################

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #18
def main(roster_row):
    try:
        """
        OLD URL: http://www.calcoso.org/divisions-jail-inmate-roster/
        UPDATED URL: https://www.calcoso.org/inmate-roster/
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'lamar'

        letters = ['A', 'E', 'I', 'N', 'O', 'R', 'U', 'Y']
        #Extract the HTML
        page_index = 1
        #Create empty list to store page sources:
        pages = []

        #Create empty list to store index of {letter}_{pagenumber}
        letters_pages = []

        for letter in letters:
            browser.get(urlAddress)
            #Use elements like below to find xpath keys and click through
            #Click I agree to terms
            time.sleep(np.random.uniform(5, 10, 1))
            searchbox = browser.find_element_by_xpath(
                '/html/body/div/div/div/div[2]/div[2]/div[2]/form/div[2]/input'
            )
            searchbox.send_keys(letter)
            searchbutton = browser.find_element_by_xpath(
                '/html/body/div/div/div/div[2]/div[2]/div[2]/form/div[4]/button'
            )
            searchbutton.click()

            #Wait
            time.sleep(np.random.uniform(5, 10, 1))

            #Default variables for entry with no navigation bar:
            split_pages = False
            page_index = 1
            finished = False

            #Check for navigation bar:
            try:
                nextpage = browser.find_element_by_link_text('Next →')
                # xpath alternative: '/html/body/div/div/div/div[2]/div[3]/div[12]/ul/li[7]/a'
                split_pages = True
            except:
                pass

            page_name = letter + '_{}'.format(page_index)
            #Save main page
            save_to_s3(browser.page_source, page_name, roster_row)
            logger.info('Saved page _%s_', page_name)

            #Perform subroutine if multiple pages are present
            if split_pages:
                while not finished:
                    try:
                        nextpage = browser.find_element_by_link_text('Next →')
                        nextpage.click()
                        page_index += 1

                        #Wait
                        time.sleep(np.random.uniform(5, 10, 1))

                        #Extract the HTML
                        page_name = letter + '_{}'.format(page_index)
                        save_to_s3(browser.page_source, page_name, roster_row)
                        logger.info('Saved page _%s_', page_name)
                        pages.append(browser.page_source)
                    except:
                        finished = True

        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #19
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        # browser = get_browser() # Get a standard browser

        urlAddress = roster_row[
            'Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_',
                    urlAddress)  # Log the chosen URL

        ####################################

        # Begin core specific scraping code
        if roster_row['State'].lower(
        ) != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))

        ##### We will use requests for this county, not selenium #####
        r = requests.get(urlAddress)
        store_source = r.content
        time.sleep(np.random.uniform(5, 7, 1))

        # Formatting as beautiful soup
        soup = BeautifulSoup(store_source, 'lxml')

        # Pulling all the hrefs
        website_link = []
        for link in soup.findAll('a'):
            data_link = link.get('href')
            website_link.append(data_link)
            print(data_link)

        # Remove all links that aren't related to the current arrestLogs
        website_df = pd.DataFrame(website_link)
        website_df.columns = ['link']
        website_df = website_df[website_df.link.str.contains(
            "arrestLogs/current/")]

        # Resetting index
        website_df = website_df.reset_index()
        website_df = website_df.drop(columns=['index'])

        # Pulling the information on the specific link that we parsed (The latest current arrest logs)
        r = requests.get('http://www.honolulupd.org/information/' +
                         website_df["link"].iloc[0])
        time.sleep(np.random.uniform(5, 7, 1))

        # Pulling the content
        store_source = r.content

        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)

        # End core specific scraping code

        ####################################

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #20
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'st. charles'
        time.sleep(np.random.uniform(5, 10, 1))
        lastpage = False
        pages = []
        names = []
        #Get first page
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        soup = BeautifulSoup(store_source, 'lxml')
        firstentry = soup.find('td', {'ordered-tag': 'name'})
        names.append(firstentry.text)
        pages.append(store_source)

        while not lastpage:
            time.sleep(np.random.uniform(5, 10, 1))
            #Navigate to next page
            try:
                nextpage = browser.find_element_by_xpath(
                    '//*[@id="primary-container"]/div/div/div/zt-collectionview/div[1]/div/div[2]/div[1]/button[2]'
                )

                nextpage.click()
            except:
                lastpage = True

            time.sleep(np.random.uniform(5, 10, 1))

            #Extract the HTML
            page_index += 1
            store_source = browser.page_source
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
            soup = BeautifulSoup(store_source, 'lxml')
            firstentry = soup.find('td', {'ordered-tag': 'name'})
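            # If the first name on this page matches the one recorded for the
            # previous page, the pager has stopped advancing: treat it as the
            # last page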
            if names[-1] == firstentry.text:
                lastpage = True
            else:
                pages.append(store_source)
                names.append(firstentry.text)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #21
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ####################################
        
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
       
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7,10,1))
        
        num_rows_found = 1000000
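        # num_rows_found starts as a large sentinel; the real row count is read
        # from the table on the first pass through the loop and becomes the
        # loop bound. Each row is clicked to capture its detail page, then we
        # navigate back and re-expand the row list.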
        rownum = 1
        store_source = browser.page_source
        row_sources = []
        row_texts = []
        while rownum < num_rows_found:
            # Clicking to show all options. The page is actually buggy:
            # if you click on a detail and then click "back", only 10 rows
            # show, and clicking "All" again doesn't help. You have to
            # select a different number of rows first...
            elem = browser.find_element_by_xpath('//*[@id="pager_center"]/table/tbody/tr/td[5]/select')
            elem.click()      
            time.sleep(np.random.uniform(1,2,1))
            
            elem = browser.find_element_by_xpath('//*[@id="pager_center"]/table/tbody/tr/td[5]/select/option[3]')
            elem.click()      
            time.sleep(np.random.uniform(1,2,1))

            elem = browser.find_element_by_xpath('//*[@id="pager_center"]/table/tbody/tr/td[5]/select')
            elem.click()      
            time.sleep(np.random.uniform(1,2,1))
            
            elem = browser.find_element_by_xpath('//*[@id="pager_center"]/table/tbody/tr/td[5]/select/option[5]')
            elem.click()      
            time.sleep(np.random.uniform(1,2,1))
            
            elem = browser.find_element_by_xpath('//*[@id="refresh_tblII"]/div/span')
            elem.click()      
            time.sleep(np.random.uniform(1,2,1))
            
            #initial_rows = browser.find_elements_by_xpath('/html/body/form/table/tbody/tr[2]/td/table/tbody/tr/td[2]/div/div[2]/div[2]/div/div[3]/div[3]/div/table/tbody/tr') 
            #Extract the HTML#
            rows = browser.find_elements_by_xpath('/html/body/form/table/tbody/tr[2]/td/table/tbody/tr/td[2]/div/div[2]/div[2]/div/div[3]/div[3]/div/table/tbody/tr') 
            logger.info('found %s rows on parse', len(rows))
            if num_rows_found == 1000000:
                num_rows_found = len(rows)
                logger.info('Found _%s_ total records', num_rows_found)

            row_texts.append(rows[rownum].text)
            elem = rows[rownum].click()
            time.sleep(np.random.uniform(1,5,1))
            row_sources.append(browser.page_source)
            logger.info('Logged id page _%s_', len(row_sources))
            browser.execute_script("window.history.go(-1)")
            time.sleep(np.random.uniform(1,5,1))
            rownum += 1
        save_to_s3(store_source, "MAINPAGE", roster_row)
        logger.info('Saved page _%s_', page_index)
        save_pages_array(row_sources, roster_row)



        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        
        # End core specific scraping code
        
        ####################################

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #22
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'lake'

        pages = []

        #Use elements like below to find xpath keys and click through
        #Click I agree to terms
        time.sleep(np.random.uniform(20, 30, 1))

        #Extract the HTML
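        # The element lookup below doubles as a readiness check: if the
        # in-custody grid hasn't rendered, find_element_by_xpath raises and the
        # outer handler records the error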
        is_name = browser.find_element_by_xpath(
            '//*[@id="MainContent_gvIncustody_Label3_0"]')
        store_source = browser.page_source
        pages.append(store_source)
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        page_index = 2
        finished = False
        while not finished:
            try:
                nextpage = browser.find_element_by_link_text(str(page_index))
                nextpage.click()
                time.sleep(np.random.uniform(5, 10, 1))
                store_source = browser.page_source
                if store_source not in pages:
                    pages.append(store_source)
                    save_to_s3(store_source, page_index, roster_row)
                    logger.info('Saved page _%s_', page_index)
                else:
                    finished = True
                page_index += 1
            except:
                finished = True
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #23
def main(roster_row):
    try:
        """
        IFRAME SITE:
        
        OLD URL: https://www.craigheadso.org/roster_custom.php
        NEW URL: https://www.myr2m.com/craigheadroster/
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'craighead'

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        #Use elements like below to find xpath keys and click through
        #Click I agree to terms
        time.sleep(np.random.uniform(20, 30, 1))
        pages = []
        page_size = browser.find_element_by_xpath(
            '//*[@id="ContentPlaceHolder1_ddlPageSize"]')
        page_size.send_keys('30', Keys.RETURN)

        time.sleep(np.random.uniform(20, 30, 1))

        #Extract the HTML
        store_source = browser.page_source
        pages.append(store_source)
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        page_index += 1
        finished = False
        while not finished:
            next_page = browser.find_element_by_xpath(
                '//*[@id="ContentPlaceHolder1_cmdNext2"]')
            try:
                next_page.click()
            except:
                pass
            time.sleep(np.random.uniform(20, 30, 1))
            store_source = browser.page_source
            if store_source not in pages:
                pages.append(store_source)
                save_to_s3(store_source, page_index, roster_row)
                logger.info('Saved page _%s_', page_index)
                page_index += 1
            else:
                finished = True
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #24
def main(roster_row):
    try:
        """
        OLD URL: https://www.inmateaid.com/inmate-profile-search
        UPDATED URL: https://chargesandbonds.arapahoegov.com/
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'colorado'
        assert roster_row['County'].lower() == 'arapahoe'
        pages = []
        letters_pages = []
        letters = [
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
            'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
        ]
        for letter in letters:
            #Search for last names starting with selected letter
            lastname = browser.find_element_by_xpath(
                '//*[@id="ContentPlaceHolder1_txtLast"]')
            lastname.send_keys(letter)

            time.sleep(np.random.uniform(2, 5, 1))

            search = browser.find_element_by_xpath(
                '//*[@id="ContentPlaceHolder1_btnSearchEn"]')
            search.click()

            #Store first page per letter
            store_source = browser.page_source
            page_index = 1
            pagename = letter + '_' + str(page_index)
            pages.append(store_source)
            save_to_s3(store_source, pagename, roster_row)
            logger.info('Saved page _%s_', pagename)

            finished = False

            #Iterate over second through last pages.
            #Stored page will be added to collection if not already in it.
            #Else, the next letter will be called.
            while not finished:
                page_index += 1
                try:
                    nextpage = browser.find_element_by_xpath(
                        '//*[@id="ContentPlaceHolder1_btnNext"]')
                    nextpage.click()
                    time.sleep(np.random.uniform(5, 10, 1))
                    store_source = browser.page_source
                    if store_source not in pages:
                        pages.append(store_source)
                        pagename = letter + '_' + str(page_index)
                        save_to_s3(store_source, pagename, roster_row)
                        logger.info('Saved page _%s_', pagename)
                        letters_pages.append(pagename)
                    else:
                        finished = True
                except:
                    finished = True
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #25
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function

        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row[
            'Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_',
                    urlAddress)  # Log the chosen URL

        ####################################

        # Begin core specific scraping code
        if roster_row['State'].lower(
        ) != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))

        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))

        #Assume there is a second page
        more_results = True

        #While a second page exists, click the "load more" button
        while more_results:
            try:
                browser.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                load_more = browser.find_element_by_xpath(
                    '//*[@id="LoadMoreButton"]/p[1]')
                load_more.click()
                time.sleep(np.random.uniform(5, 10, 1))

            except:
                more_results = False

        finished = False

        while not finished:
            try:
                expandable = browser.find_element_by_xpath(
                    "//td[contains(text(),'[+]')]")
            except:
                finished = True
            time.sleep(np.random.uniform(0.5, 1, 1))
            try:
                expandable.click()
            except:
                pass
            time.sleep(np.random.uniform(0.5, 1, 1))

        # Extract the HTML
        store_source = browser.page_source

        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)

        # End core specific scraping code

        ####################################

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #26
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function

        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row[
            'Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_',
                    urlAddress)  # Log the chosen URL

        ####################################

        # Begin core specific scraping code
        if roster_row['State'].lower(
        ) != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))

        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))

        #Click I Agree
        try:
            elem = browser.find_element_by_xpath('//*[@id="submit2"]')
            elem.click()
            time.sleep(np.random.uniform(2, 4, 1))

        except NoSuchElementException:
            time.sleep(np.random.uniform(2, 4, 1))

        #Extract the HTML#
        store_source = browser.page_source

        #Select institution (CLAYTON COUNTY JAIL)
        soup = BeautifulSoup(store_source, 'lxml')
        nameList = soup.findAll("select", {"name": "vCurrentInstitution"})
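        # Flatten the <select> markup into a list of "value label" strings so
        # the index of the CLAYTON COUNTY JAIL option can be located below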
        for i in nameList:
            nameList = str(i)
            nameList = re.sub('</option>', "", nameList)
            nameList = re.sub('</select>', "", nameList)
            nameList = re.sub('"', "", nameList)
            nameList = re.sub('>', " ", nameList)
            nameList = nameList.split("<option value=")

        regex = re.compile("^CLAYTON COUNTY JAIL")
        list_index = [
            i for i, item in enumerate(nameList) if re.search(regex, item)
        ]
        list_index = int(list_index[0])

        #Click institution
        elem = browser.find_element_by_xpath('//*[@id="vCurrentInstitution"]')
        elem.click()
        time.sleep(np.random.uniform(2, 4, 1))

        #Click the name of the jail
        elem = browser.find_element_by_xpath(
            '//*[@id="vCurrentInstitution"]/option[' + str(list_index + 1) +
            ']')
        elem.click()
        time.sleep(np.random.uniform(2, 4, 1))

        #Click Submit Form
        elem = browser.find_element_by_xpath('//*[@id="NextButton2"]')
        elem.click()
        time.sleep(np.random.uniform(4, 6, 1))

        # Extract the HTML
        store_source = browser.page_source

        ## Code to save the first page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)

        #Finding the last page
        soup = BeautifulSoup(store_source, 'lxml')
        page = 0
        for link in soup.findAll("span", {"class": "oq-nav-btwn"}):
            page = str(link.text)
            page = re.sub("Page 1 of ", "", page)
            page = int(page)

        #Crawling through all the pages
        string = str(1)
        try:
            for i in range(2, page + 1):
                elem = browser.find_element_by_xpath('//*[@id="oq-nav-nxt"]')
                elem.click()
                time.sleep(np.random.uniform(5, 7, 1))
                store_source = browser.page_source
                string = str(i)
                ## Code to save a page and log appropriately
                page_index = int(string) - 1
                save_to_s3(store_source, page_index, roster_row)
                logger.info('Saved page _%s_', page_index)
        except NoSuchElementException as errorMessage:
            print("Please review this crawler")
            time.sleep(np.random.uniform(2, 4, 1))
            try:
                browser.close()
                record_error(message=str(errorMessage),
                             roster_row=roster_row,
                             browser=browser)
            except:
                record_error(message=str(errorMessage), roster_row=roster_row)

            # Record error in S3 for a general error
            logger.error('Error: %s', errorMessage)
            # Log error
            sys.exit(1)

        # End core specific scraping code

        ####################################

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #27
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function

        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row[
            'Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_',
                    urlAddress)  # Log the chosen URL

        ####################################

        # Begin core specific scraping code
        if roster_row['State'].lower(
        ) != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))

        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))

        # Open the results-per-page dropdown in the pager
        elem = browser.find_element_by_xpath(
            '//*[@id="pager_center"]/table/tbody/tr/td[5]/select')
        elem.click()
        time.sleep(np.random.uniform(2, 4, 1))

        # Choose the option that displays all records on a single page
        elem = browser.find_element_by_xpath(
            '//*[@id="pager_center"]/table/tbody/tr/td[5]/select/option[5]')
        elem.click()
        time.sleep(np.random.uniform(4, 7, 1))
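
        # With every record shown on a single page, one capture of the page
        # source is sufficient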

        # Extract the HTML
        store_source = browser.page_source

        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)

        # End core specific scraping code

        ####################################

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Ejemplo n.º 28
0
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function

        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row[
            'Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_',
                    urlAddress)  # Log the chosen URL

        ####################################

        # Begin core specific scraping code
        if roster_row['State'].lower(
        ) != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))

        # Extract the HTML
        store_source = browser.page_source

        ## Code to save the first page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)

        #Finding the last page
        soup = BeautifulSoup(store_source, 'lxml')
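        # The results header reads like 'N Results for "_"'; assuming 10 records
        # per page, divide by 10 and round up to get the page count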
        page = 0
        for link in soup.findAll("div",
                                 {"class": "loca-search-head text-center"}):
            page = str(link.text)
            page = re.sub(' Results for "_"', "", page)
            page = int(page) / 10
            page = math.ceil(page)

        #Crawling through all the pages
        string = str(1)
        for i in range(2, page + 1):
            if i > 30:
                # More than 30 pages (over 300 inmates); note it and skip clicking
                print("Exceeds 300 inmates")
                continue
            # The "next page" link sits at li[i + 1] for the first few pages and
            # stays at li[6] from page 5 onward
            elem = browser.find_element_by_xpath(
                '/html/body/div/div/div/div[2]/div[3]/div[12]/ul/li[' +
                str(min(i + 1, 6)) + ']/a')
            elem.click()
            time.sleep(np.random.uniform(3, 5, 1))
            store_source = browser.page_source
            string = str(i)
            ## Code to save the page and log appropriately
            page_index = int(string) - 1
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)

        # End core specific scraping code

        ####################################

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Ejemplo n.º 29
0
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ####################################
        
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
       
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7,10,1))
        
        # Extract the HTML
        store_source = browser.page_source

        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        
        #Finding the last page
        soup = BeautifulSoup(store_source, 'lxml')
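        # The grid's page indicator (div id "ext-comp-1010") reads "of N"; strip
        # the prefix to get the page count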
        page=0
        for link in soup.findAll("div", {"id":"ext-comp-1010"}):
            page=str(link.text)
            page=re.sub("of ", "", page)
            page=int(page)

        #Crawling through all the pages
        string = str(1)
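        # Each click on the element with id "ext-gen110" advances the grid to
        # the next page of results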
        for i in range(2,page+1):
            elem = browser.find_element_by_xpath('//*[@id="ext-gen110"]')
            elem.click()        
            time.sleep(np.random.uniform(7,10,1))
            store_source = browser.page_source
            string=str(i)
            ## Code to save a page and log appropriately
            page_index = int(string)-1
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
            
        # End core specific scraping code
        
        ####################################

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Ejemplo n.º 30
0
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function

        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row[
            'Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_',
                    urlAddress)  # Log the chosen URL

        ####################################

        # Begin core specific scraping code
        if roster_row['State'].lower(
        ) != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))

        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(7, 10, 1))

        # Extract the HTML
        store_source = browser.page_source

        ## Code to save the first page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)

        # Finding the last page
        soup = BeautifulSoup(store_source, 'lxml')
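        # The pager cell reads "Page 1 of N ..."; strip the prefix and keep the
        # number before the next space to get the page count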
        page = 0
        for link in soup.findAll("td", {"style": "white-space:nowrap;"}):
            page = str(link.text)
            page = re.sub("Page 1 of ", "", page)
            page = page[:page.index(" ")]
            page = int(page)

        #Crawling through all the pages
        string = str(1)
        if page == 0:
            print("No inmates listed")
        if page <= 10:
            for i in range(2, page + 1):
                try:
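                    # Page-number cells appear at every other column of the
                    # pager row, hence the (i * 2) + 3 offset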
                    elem = browser.find_element_by_xpath(
                        '//*[@id="Content_MainContent_ASPxGridView3_DXPagerTop"]/tbody/tr/td/table/tbody/tr/td['
                        + str((i * 2) + 3) + ']')
                    elem.click()
                    time.sleep(np.random.uniform(5, 7, 1))
                    store_source = browser.page_source
                    string = str(i)
                    ## Code to save a page and log appropriately
                    page_index = int(string) - 1
                    save_to_s3(store_source, page_index, roster_row)
                    logger.info('Saved page _%s_', page_index)
                # If the expected pager cell is missing, the crawler's XPaths need updating
                except NoSuchElementException as errorMessage:
                    print("Please review script for this county.")
                    try:
                        browser.close()
                        record_error(message=str(errorMessage),
                                     roster_row=roster_row,
                                     browser=browser)
                    except:
                        record_error(message=str(errorMessage),
                                     roster_row=roster_row)

                    # Record error in S3 for a general error
                    logger.error('Error: %s', errorMessage)
                    # Log error
                    sys.exit(1)

        elif page > 10:
            for i in range(2, page + 1):
                try:
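                    # td[27] normally holds the "next page" arrow image in this
                    # grid's pager; the fallback below handles the td[29] layout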
                    elem = browser.find_element_by_xpath(
                        '//*[@id="Content_MainContent_ASPxGridView3_DXPagerTop"]/tbody/tr/td/table/tbody/tr/td[27]/img'
                    )
                    elem.click()
                    time.sleep(np.random.uniform(5, 7, 1))
                    store_source = browser.page_source
                    string = str(i)
                    ## Code to save a page and log appropriately
                    page_index = int(string) - 1
                    save_to_s3(store_source, page_index, roster_row)
                    logger.info('Saved page _%s_', page_index)
                    # Fallback: on some rosters the "next page" image sits at td[29] rather than td[27]
                except NoSuchElementException:
                    elem = browser.find_element_by_xpath(
                        '//*[@id="Content_MainContent_ASPxGridView3_DXPagerTop"]/tbody/tr/td/table/tbody/tr/td[29]/img'
                    )
                    elem.click()
                    time.sleep(np.random.uniform(5, 7, 1))
                    store_source = browser.page_source
                    string = str(i)
                    ## Code to save a page and log appropriately
                    page_index = int(string) - 1
                    save_to_s3(store_source, page_index, roster_row)
                    logger.info('Saved page _%s_', page_index)

        # End core specific scraping code

        ####################################

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)