Example #1
import sys

# Assumes project helpers: get_logger, record_error, crawlers,
# THIS_STATE, THIS_COUNTY (stub sketches follow the example)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        ##########
        # Begin core specific scraping code

        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.save_single_page(roster_row, filetype='html')  # try to call a known crawler if possible

        # End core specific scraping code
        ##########

        # Nothing to close here: save_single_page manages its own browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            # browser is only defined when a page was opened manually;
            # otherwise this raises and we fall back to recording without it
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except Exception:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Log the error and exit with a failure status
        logger.error('Error: %s', errorMessage)
        sys.exit(1)
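
All three examples lean on project-level helpers and constants that are not shown here (get_logger, record_error, crawlers.save_single_page, THIS_STATE, THIS_COUNTY). A minimal sketch of stand-ins that would let Example #1 run in isolation might look like this; only the names and call signatures come from the examples themselves, and every body below is an assumption.

import logging

# Placeholder constants; each real scraper module defines its own pair.
THIS_STATE = 'somestate'
THIS_COUNTY = 'somecounty'

def get_logger(roster_row):
    # Stand-in: a logger named after the row's state and county.
    logging.basicConfig(level=logging.INFO)
    return logging.getLogger('%s_%s' % (roster_row['State'], roster_row['County']))

def record_error(message, roster_row, browser=None):
    # Stand-in: the real helper presumably persists the error (the comments
    # in the examples mention S3); here it just logs it.
    logging.getLogger('record_error').error('%s (row: %s)', message, roster_row)

class crawlers:
    # Stand-in namespace mirroring the crawlers module used above.
    @staticmethod
    def save_single_page(roster_row, filetype='html'):
        logging.getLogger('crawlers').info(
            'would save %s as %s', roster_row['Working Link'], filetype)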
Example #2
import sys

def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser() # Get a standard browser
        #urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        #page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        #logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.save_single_page(roster_row) # try to call a known crawler if possible
        ## Code to save a page and log appropriately
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        # Nothing to close here: save_single_page manages its own browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            # browser is never opened in this variant, so closing it raises
            # a NameError and we fall back to recording the error without it
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except Exception:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Log the error and exit with a failure status
        logger.error('Error: %s', errorMessage)
        sys.exit(1)
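
For reference, roster_row is indexed with the keys 'State', 'County', and 'Working Link' in these examples, so a minimal invocation might look like the following; the values are invented and must match THIS_STATE and THIS_COUNTY for the guard clause to pass.

# Hypothetical row; keys mirror the lookups in the examples above.
sample_row = {
    'State': 'somestate',    # compared (lower-cased) against THIS_STATE
    'County': 'somecounty',  # compared (lower-cased) against THIS_COUNTY
    'Working Link': 'https://example.com/roster',
}

main(sample_row)  # saves the page, or records the error and exits with status 1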
Example #3
import sys
import time

import numpy as np

def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Standard variable initialization (done inline here, unlike the
        # save_single_page-only variants above)
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Initial value of "page_index", used to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.save_single_page(roster_row)  # try to call a known crawler if possible
        browser.get(urlAddress)

        # Show all inmates instead of 6 per page
        time.sleep(np.random.uniform(5, 10))  # draw a scalar, not a size-1 array
        show_all = browser.find_element_by_xpath('//*[@id="inmatesPerPage"]')
        show_all.send_keys('All')
        logger.info('clicked "All"')

        # Wait for the full roster to render
        time.sleep(np.random.uniform(15, 20))

        #Extract the HTML
        store_source = browser.page_source
        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        # Close the browser
        browser.close()
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            # Close the browser if it was opened and record the error with it;
            # otherwise fall back to recording the error alone
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except Exception:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Log the error and exit with a failure status
        logger.error('Error: %s', errorMessage)
        sys.exit(1)
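
The fixed time.sleep delays in Example #3 make the scrape slow and still racy. Under the same Selenium API, an explicit wait plus the Select helper would target the 'inmatesPerPage' dropdown more robustly; this is a sketch of that swap, not the project's actual code.

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait

def show_all_inmates(browser, timeout=30):
    # Block until the per-page dropdown exists instead of sleeping a fixed interval.
    dropdown = WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.ID, 'inmatesPerPage')))
    # Assumes the control is a <select>; pick the 'All' option by visible text.
    Select(dropdown).select_by_visible_text('All')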