def write_log():
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    # =============================================================================
    # WRITE RESULTS OF SUCCESS TO LOCAL DB 
    # =============================================================================   
    dbmgr = pyDB(g['DB'])
    dbmgr.write_log(finished_at, None, **g)
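All of these examples call a pyDB helper that is not shown on this page. Below is a minimal sqlite3-backed sketch of what such a class might look like; the class name and call signatures follow the usage above, but the table and column names are assumptions rather than the actual implementation.

import sqlite3

class pyDB:
    def __init__(self, db_path):
        self.db_path = db_path

    def query(self, q):
        # RUN A STATEMENT AND RETURN ANY ROWS
        with sqlite3.connect(self.db_path) as conn:
            return conn.execute(q).fetchall()

    def get_vars(self, **g):
        # RETURN (NAME, VALUE) PAIRS FOR THE CALLING PACKAGE (table/columns assumed)
        q = "SELECT VAR_NME, VAR_VAL FROM PY_VARS WHERE PKG_NME = ?"
        with sqlite3.connect(self.db_path) as conn:
            return conn.execute(q, (g.get('PKG_NME'),)).fetchall()

    def write_log(self, finished_at, error_msg, **g):
        # WRITE A STATUS ROW TO THE PROCESS LOG (columns mirror the queries above; ERR_MSG is assumed)
        status = 'ERROR' if error_msg else 'SUCCESS'
        q = ("INSERT INTO PY_PROCESS_LOG "
             "(MSMT_DTE_ID, PKG_NME, START_DATETIME, END_DATETIME, STATUS, ERR_MSG, VARS) "
             "VALUES (?, ?, ?, ?, ?, ?, ?)")
        with sqlite3.connect(self.db_path) as conn:
            conn.execute(q, (g.get('MSMT_DTE_ID'), g.get('PKG_NME'), g.get('STARTED_AT'),
                             finished_at, status, error_msg, str(g)))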
Example #2
def get_vars():
    dbmgr = pyDB(g['DB'])
    rslt = dbmgr.get_vars(**g)
    # ADD RESULTS FROM GET_VARS CALL TO DICTIONARY (g)
    for r in rslt:
        g[str(r[0])] = str(r[1])
        #print(r)
    print([g])
def script_run():
    # =============================================================================
    # RETURNS LIST OF PACKAGES THAT HAVE ERRORS LOGGED IN THE LOG TABLE
    # - RERUNS ANY PACKAGES FOUND
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""select dat.PKG_NME, dat.VARS from PY_PROCESS_LOG dat,
                (
                    select PKG_NME, max( START_DATETIME ) as START_DATETIME, max( END_DATETIME ) as END_DATETIME from PY_PROCESS_LOG
                    where 1 = 1 and msmt_dte_id = strftime('%Y%m%d',date('now','localtime')) and PKG_NME not like '%BATCH%'
                    group by PKG_NME
                ) list
            where
                1 = 1 and dat.PKG_NME = list.PKG_NME and dat.END_DATETIME = list.END_DATETIME and dat.STATUS = 'ERROR'"""
    rslt = dbmgr.query(q)

    # =============================================================================
    # CREATES A NEW DICTIONARY AND POPULATE WITH QUERY RESULTS
    # =============================================================================
    log_vars = {k: v for k, v in rslt}
    # =============================================================================
    # RUN EACH PACKAGE LISTED IN DICTIONARY
    # =============================================================================
    try:
        for pkg in log_vars:
            #print(pkg + ' corresponds to ' + log_vars[pkg])
            tempDict = log_vars[pkg]
            tempDict = eval(tempDict)  # CONVERTS STRING TO DICTIONARY

            subprocess.call(
                [pyPath,
                 str(tempDict['PKG_PATH'] + r"\\" + pkg + '.py')])
    except:
        # capture a finish time to be entered into the db
        finished_at = time.strftime("%Y-%m-%d %H:%M:%S")
        # =============================================================================
        # WRITE RESULTS OF ERROR TO LOCAL DB
        # =============================================================================
        e = sys.exc_info()
        dbmgr = pyDB(g['DB'])
        dbmgr.write_log(finished_at, 'SCRIPT RUN ERROR: ' + str(e), **g)
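script_run above relies on eval() to turn the stored VARS string back into a dictionary. When the stored value is a plain Python literal, ast.literal_eval is a safer drop-in, since it refuses to execute arbitrary expressions; a small sketch (the sample string is made up):

import ast

vars_str = "{'PKG_PATH': 'C:/scripts/pkg', 'DB': 'local.db'}"  # sample stored VARS value (format assumed)
tempDict = ast.literal_eval(vars_str)  # parses literals only; raises ValueError on anything else
print(tempDict['PKG_PATH'])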
def email_status(step):
    if step == 'START':
        # SUBJECT & RECIPIENTS
        mymail = pyMail(
            g['PKG_NME_PRNT'] + ' - STARTED @ ' +
            time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('End Of Message')
        # SEND
        mymail.send(**g)
    elif step == 'END':
        # =============================================================================
        # LOOPS THROUGH TABLE LIST AND GENERATES SUMMARY DATA FOR EMAIL
        # =============================================================================
        dbmgr = pyDB(g['DB'])
        q = r"""SELECT msmt_dte_id, count(*) as row_count, round(sum(INDEX_VAL),4) as index_val, round(sum(HIGH_VAL),4) as high_val, round(sum(low_VAL),4) as low_val, round(sum(TTL_MRKT_VAL),4) as ttl_mrkt_val FROM {0} WHERE 1 = 1 GROUP BY msmt_dte_id ORDER BY msmt_dte_id DESC LIMIT 5""".format(
            'PY_COMMODITY_DATA')
        rslt = dbmgr.query(q)
        # =============================================================================
        # EMAIL SUMMARY OF RESULTS TO DISTRIBUTION LIST
        # =============================================================================
        htmlRes = '''<table cellpadding="8" cellspacing="3" border="1">
                    <tr>
                    <th>msmt_date_id</th>
                    <th>row_count</th>
                    <th>index_val</th>
                    <th>high_val</th>
                    <th>low_val</th>
                    <th>ttl_mrkt_val</th>
                    </tr>'''
        for r in rslt:
            htmlRes = htmlRes + '<tr><td>' + str(r[0]) + '</td><td>' + str(
                r[1]) + '</td><td>' + str(r[2]) + '</td><td>' + str(
                    r[3]) + '</td><td>' + str(r[4]) + '</td><td>' + str(
                        r[5]) + '</td></tr>'
        htmlRes = htmlRes + '</table>'
        # SUBJECT & RECIPIENTS
        mymail = pyMail(
            g['PKG_NME_PRNT'] + ' - ENDED @ ' +
            time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('Scrape has completed for : ' + g['PKG_NME_PRNT'])
        # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH SO LINE FEEDS NOT REQUIRED)
        # ADD LINE OF TEXT
        mymail.htmladd('Summary of Scrape for ' + g['PKG_NME_PRNT'])
        # ADD HTML TABLE CONSTRUCTED ABOVE
        mymail.htmladd(htmlRes)
        # SEND
        mymail.send(**g)
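The END branch above assembles the summary table by chaining str() concatenations. Purely as an illustration, the same rows can be produced with join over each result tuple (the sample data here is made up):

rslt = [('20240101', 5, 1.2345, 2.0, 0.9, 10.5)]  # sample rows shaped like the query result above
headers = ('msmt_date_id', 'row_count', 'index_val', 'high_val', 'low_val', 'ttl_mrkt_val')
htmlRes = '<table cellpadding="8" cellspacing="3" border="1">'
htmlRes += '<tr>' + ''.join('<th>{}</th>'.format(h) for h in headers) + '</tr>'
for r in rslt:
    htmlRes += '<tr>' + ''.join('<td>{}</td>'.format(col) for col in r) + '</tr>'
htmlRes += '</table>'
print(htmlRes)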
    def send(self, **g):
        # =============================================================================
        # ATTEMPT TO SEND EMAIL 3 TIMES BEFORE FAILING WITH ERROR
        # =============================================================================
        for i in range(3):
            try:
                db = self.db
                # =============================================================================
                # CONVERT STRING TO ARRAY FOR EMAIL DISTRIBUTION SEND
                # =============================================================================
                recipients = self.recipients
                recipient_array = []
                # CONVERTS LIST OBJECT TO ARRAY
                for item in recipients.split(','):  # COMMA, OR OTHER
                    recipient_array.append(item)

                msg = MIMEMultipart('alternative')
                msg['From'] = self.sender
                msg['Subject'] = self.subject
                msg['To'] = ", ".join(
                    recipient_array
                )  # TO:MUST BE ARRAY OF THE FORM ['*****@*****.**']
                msg.preamble = "preamble goes here"
                # CHECKS FOR ATTACHMENTS AND ADDS IF FOUND
                if self.attachments:
                    self.attach(msg)
                # ADD HTML BODY AFTER ATTACHMENTS
                msg.attach(MIMEText(self.htmlbody, 'html'))
                # SEND
                s = smtplib.SMTP('smtp.gmail.com:587')
                s.ehlo()
                s.starttls()
                s.login(self.sender, self.senderpass)
                s.sendmail(self.sender, recipient_array, msg.as_string())  # sendmail expects a list of recipient addresses
                # TEST
                print(msg)
                s.quit()
                # BREAK FROM LOOP
                break
            except:
                # capture a finish time to be entered into the db
                finished_at = time.strftime("%Y-%m-%d %H:%M:%S")
                # =============================================================================
                # WRITE RESULTS OF ERROR TO LOCAL DB
                # =============================================================================
                e = sys.exc_info()
                dbmgr = pyDB(db)
                dbmgr.write_log(finished_at, 'EMAIL ERROR: ' + str(e), **g)
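pyMail.send above retries delivery three times; note that smtplib's sendmail expects a list of recipient addresses (the comma-joined string belongs only in the To: header). A minimal, self-contained sketch of the core send step, with placeholder addresses and credentials:

import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

def send_html(sender, password, recipients, subject, htmlbody):
    # recipients is a list of addresses; the To: header is the joined display string
    msg = MIMEMultipart('alternative')
    msg['From'] = sender
    msg['To'] = ", ".join(recipients)
    msg['Subject'] = subject
    msg.attach(MIMEText(htmlbody, 'html'))
    with smtplib.SMTP('smtp.gmail.com', 587) as s:  # closes the connection automatically
        s.ehlo()
        s.starttls()
        s.login(sender, password)
        s.sendmail(sender, recipients, msg.as_string())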
def email_status(step):
    if step == 'START':
        # SUBJECT & RECIPIENTS
        mymail = pyMail(g['PKG_NME_PRNT'] + ' - STARTED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('End Of Message')
        # SEND
        mymail.send(**g)
    elif step == 'END':
        # =============================================================================
        # LOOPS THROUGH TABLE LIST AND GENERATES SUMMARY DATA FOR EMAIL
        # =============================================================================   
        dbmgr = pyDB(g['DB'])
        q = r"""SELECT MSMT_DTE_ID, CNTRY_CDE, count(*) as row_count FROM {0} WHERE 1 = 1 AND  msmt_dte_id >= strftime('%Y%m%d', date('now','localtime','-6 day'))
                 GROUP BY MSMT_DTE_ID, CNTRY_CDE
                 ORDER BY CASE CNTRY_CDE WHEN 'AU' THEN 1
                                         WHEN 'NZ' THEN 2
                                         WHEN 'UK' THEN 3
                                         WHEN 'CA' THEN 4
                                         WHEN 'US' THEN 5
                           END
                         , 1 DESC""".format(
            'PY_EMP_TWITTER_DATA')
        rslt = dbmgr.query(q)
        # =============================================================================
        # EMAIL SUMMARY OF RESULTS TO DISTRIBUTION LIST
        # =============================================================================
        htmlRes = '''<table cellpadding="8" cellspacing="3" border="1">
                    <tr>
                    <th>msmt_date_id</th>
                    <th>cntry_cde</th>
                    <th>row_count</th>
                    </tr>'''
        for r in rslt:
            htmlRes = htmlRes + '<tr><td>' + str(r[0]) + '</td><td>' + str(r[1])  + '</td><td>' + str(r[2])  + '</td></tr>'
        htmlRes = htmlRes + '</table>'
        # SUBJECT & RECIPIENTS
        mymail = pyMail(g['PKG_NME_PRNT'] + ' - ENDED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('Scrape has completed for : ' + g['PKG_NME_PRNT'])
        # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH SO LINE FEEDS NOT REQUIRED)
        # ADD LINE OF TEXT
        mymail.htmladd('Summary of Scrape for ' + g['PKG_NME_PRNT'])
        # ADD HTML TABLE CONSTRUCTED ABOVE
        mymail.htmladd(htmlRes)
        # SEND
        mymail.send(**g)
def script_run():
    # =============================================================================
    # CONVERT STR TO LIST TO ARRAY FOR TABLE LIST RUNNING
    # =============================================================================
    pkgs_to_run = g['PKGS_TO_RUN']
    pkgs_to_run_array = []
    # CONVERTS LIST OBJECT TO ARRAY FOR LOOPING
    for item in pkgs_to_run.split(','):  # COMMA, OR OTHER
        pkgs_to_run_array.append(item)
    try:
        # LOOP THROUGH ALL THE PACKAGES/SCRIPTS FOR THIS FOLDER
        for pkg in pkgs_to_run_array:
            subprocess.call([pyPath, str(path + r"\\" + pkg + '.py')])
    except:
        # capture a finish time to be entered into the db
        finished_at = time.strftime("%Y-%m-%d %H:%M:%S")
        # =============================================================================
        # WRITE RESULTS OF ERROR TO LOCAL DB
        # =============================================================================
        e = sys.exc_info()
        dbmgr = pyDB(g['DB'])
        dbmgr.write_log(finished_at, 'SCRIPT RUN ERROR: ' + str(e), **g)
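script_run above launches each package with subprocess.call and a hand-built path string. A variation using subprocess.run and os.path.join reports each package's exit code; the interpreter path and folder layout here are assumptions:

import os
import subprocess
import sys

def run_packages(py_path, pkg_dir, pkg_names):
    # Runs <pkg>.py for each package with the given interpreter and reports non-zero exits.
    for pkg in pkg_names:
        script = os.path.join(pkg_dir, pkg + '.py')
        result = subprocess.run([py_path, script])
        if result.returncode != 0:
            print('package {0} exited with code {1}'.format(pkg, result.returncode),
                  file=sys.stderr)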
Example #8
def store_tweet(**tweet): # (tweet)
    # =============================================================================
    # WRITE RESULTS OF TWEEP EXTRACT TO LOCAL DB
    # =============================================================================   
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, LOCATION, GEO, COORDS, UTC_OFFSET, TIME_ZONE, LANGUAGE, DESCRIPTION, TEXT, USER_NAME, USER_CREATED, RETWEET_COUNT, POLARITY, SUBJECTIVITY) 
        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}', '{13}', {14}, {15}, {16})""".format(
         g['TBL_NME'] #[0]
        ,tweet['MSMT_DTE_ID'] #[1]
        ,tweet['DATA_TYPE'] #[2]
        ,tweet['CNTRY_CDE'] #[3]
        ,tweet['LOCATION'] #[4]
        ,tweet['GEO']  #[5]
        ,tweet['COORDS']  #[6]
        ,tweet['UTC_OFFSET']  #[7]
        ,tweet['TIME_ZONE']  #[8]
        ,tweet['LANGUAGE']  #[9]
        ,tweet['DESCRIPTION'] #[10]
        ,tweet['TEXT'] #[11]
        ,tweet['USER_NAME'] #[12]
        ,tweet['USER_CREATED'] #[13]
        ,tweet['RETWEET_COUNT'] #[14]
        ,tweet['POLARITY'] #[15] - assumed present in the tweet kwargs
        ,tweet['SUBJECTIVITY'] #[16] - assumed present in the tweet kwargs
        )
    dbmgr.query(q)
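store_tweet above interpolates every field straight into the INSERT text, which breaks as soon as a tweet contains a quote character. With sqlite3 the same insert can bind the values as parameters; a sketch assuming the table matches the column list above:

import sqlite3

def store_tweet_parameterized(db_path, table, **tweet):
    # Column order mirrors the INSERT above; sqlite3 handles quoting of the bound values.
    # The table name itself cannot be bound, so it is still formatted in.
    cols = ('MSMT_DTE_ID', 'DATA_TYPE', 'CNTRY_CDE', 'LOCATION', 'GEO', 'COORDS',
            'UTC_OFFSET', 'TIME_ZONE', 'LANGUAGE', 'DESCRIPTION', 'TEXT',
            'USER_NAME', 'USER_CREATED', 'RETWEET_COUNT', 'POLARITY', 'SUBJECTIVITY')
    q = "INSERT INTO {0} ({1}) VALUES ({2})".format(
        table, ', '.join(cols), ', '.join('?' * len(cols)))
    with sqlite3.connect(db_path) as conn:
        conn.execute(q, tuple(tweet.get(c) for c in cols))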
Example #9
    def htmlDownloadLink(url, fileSearchStr, linkId, **g):
        # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
        try:
            rndm_sleep = int(g['SLEEP_VAL'])
        except:
            rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
            rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
            rndm_sleep = random.randint(rLow, rHigh)

        try:
            # ================================================================
            # DOWNLOAD FILE FROM PAGE LINK
            # ================================================================
            # add missing support for chrome "send_command"  to selenium webdriver
            # TRY 1 - NOT WORKING
            #                 driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
            #                 params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': g['CONFIG']['DB_DIR'] + '__fx'}}
            #                 command_result = driver.execute("send_command", params)

            # TRY 2 - NOT WORKING - STILL SAVES TO DEFAULT DIRECTORY
            chromeOptions = webdriver.ChromeOptions()
            chromeOptions.add_argument("--start-maximized")
            prefs = {
                "profile.default_content_settings.popups": 0,
                "download.default_directory": g['CONFIG']['DB_DIR'] +
                "__fx\\",  # IMPORTANT - ENDING SLASH V IMPORTANT
                "directory_upgrade": True
            }
            chromeOptions.add_experimental_option("prefs", prefs)

            driver = webdriver.Chrome(
                executable_path=str(g['DRVR_PATH'] + '\\' + g['WEB_DRVR_NME']),
                chrome_options=chromeOptions)  #chromeDrvr
            driver.get(url)
            if linkId != '':
                # instantiate a click on the desired page element
                driver.find_element_by_id(linkId).click()

            time.sleep(int(rndm_sleep))
            #NOT WORKING - GET STUCK IN ENDLESS LOOP
            #             for file in os.listdir(g['DEFAULT_SYS_DOWNLOAD_PATH']):
            #                 if file.endswith(fileSearchStr + '.crdownload') or file.endswith(fileSearchStr + '.part'):
            #                     while True: # ascii/tick-data-quotes/eurusd/2017/10
            #                         if file.endswith(fileSearchStr + '.crdownload') or file.endswith(fileSearchStr + '.part'):
            #                             time.sleep(10)
            #                         elif file.endswith(fileSearchStr):
            #                             break
            #                         else:
            #                             time.sleep(10)
            #                 else:
            #                     None
            driver.close()
            driver.quit()
            driver.stop_client()

            #return dlLink
        except:
            # capture a finish time to be entered into the db
            finished_at = time.strftime("%Y-%m-%d %H:%M:%S")
            e = sys.exc_info()
            print('ERROR ENCOUNTERED : ' + str(e))
            # =============================================================================
            # WRITE RESULTS OF ERROR TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            dbmgr.write_log(finished_at, 'HTML PASSING ERROR: ' + str(e), **g)
            # =============================================================================
            # EMAIL SUMMARY OF ERROR TO DISTRIBUTION LIST
            # =============================================================================
            htmlRes = '''<table cellpadding="8" cellspacing="3" border="3">
                        <tr>
                        <th>msmt_date_id</th>
                        <th>pkg_nme</th>
                        <th>start_datetime</th>
                        <th>end_datetime</th>
                        <th>status</th>
                        </tr>'''
            htmlRes = htmlRes + '<tr><td>' + str(
                g['MSMT_DTE_ID']) + '</td><td>' + str(
                    g['PKG_NME']) + '</td><td>' + str(
                        g['STARTED_AT']) + '</td><td>' + str(
                            finished_at) + '</td><td>' + 'ERROR' + '</td></tr>'
            htmlRes = htmlRes + '</table>'
            # SUBJECT & RECIPIENTS
            mymail = pyMail(
                str(g['PKG_NME']) + ' - ERROR ENCOUNTERED @ ' +
                time.strftime("%Y-%m-%d %H:%M:%S"), **g)
            # START HTML BODY (GREETING / OPENING LINE OF EMAIL).
            mymail.htmladd(
                'A DOWNLOAD LINK ERROR was encountered for package : ' +
                str(g['PKG_NME']))
            # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH SO LINE FEEDS NOT REQUIRED)
            # ADD LINE OF TEXT
            mymail.htmladd('Summary of ERROR')
            # ADD HTML TABLE CONSTRUCTED ABOVE
            mymail.htmladd(htmlRes)
            # HEADER FOR ERROR TEXT
            mymail.htmladd('<b><u>ERROR DETAIL</u></b>')
            # ADD FULL ERROR TO BODY OF EMAIL
            mymail.htmladd(str(e).replace('<', '(').replace('>', ')'))
            # SEND
            mymail.send()
            # QUIT EXECUTION OF PYTHON SCRIPT
            quit()
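htmlDownloadLink above notes that its commented-out polling loop for the downloaded file could spin forever. A bounded wait that watches the download directory for in-progress Chrome/Firefox files (.crdownload/.part) might look like this; the directory, timeout, and poll interval are assumptions:

import os
import time

def wait_for_download(download_dir, file_search_str, timeout=300, poll=10):
    # Returns True once no matching in-progress download remains, False if the timeout expires.
    deadline = time.time() + timeout
    while time.time() < deadline:
        in_progress = [
            f for f in os.listdir(download_dir)
            if file_search_str in f and (f.endswith('.crdownload') or f.endswith('.part'))
        ]
        if not in_progress:
            return True
        time.sleep(poll)
    return False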
Example #10
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow,rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'], #[0]
        g['MSMT_DTE_ID'], #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'], #[3]
        g['SITE_CDE'] #[4]
        ) 
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url,**g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #soup = soup.encode("utf-8","ignore").decode('ascii', 'ignore')
    #print(soup.encode("utf-8","ignore").decode('ascii', 'ignore'))
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes  
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - INDUSTRY COUNT =====================================================================
    for div in soup.find_all('div', class_='jsCustomScrollContent'):
    # =================================================================================================
    # JOBTYPE CLASS ONLY
    # =================================================================================================  
        for div_child in div.find_all('div', id='JobType'):
            # return the section header (facet type) for each of the child elements
            for span in div_child.find_all('span'):
                facet_type = span.text.upper()
            for div_data in div_child.find_all('div', class_='refineitem'):
                facet = div_data.find_all('label')                
                # FACET DESC (type desc)
                facet_desc = re.search('>(.*)</label', str(facet[0]))
                facet_desc = str(facet_desc.group(1)).upper()               
                # FACET COUNT
                facet_count = re.search('>(.*)</label', str(facet[1]))
                if facet_count.group(1):
                    facet_count = facet_count.group(1)
                else:
                    facet_count = '0'                
                # REMOVE THE CATCH-ALL IN THE LIST FROM THE INSERT STATEMENT
                if facet_desc != 'ANY':
                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================   
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'], #[0]
                        g['MSMT_DTE_ID'], #[1]
                        g['DATA_TYPE'], #[2]
                        g['CNTRY_CDE'], #[3]
                        g['SITE_CDE'], #[4]
                        facet_type, #[5]
                        facet_desc, #[6]
                        facet_count, #[7]
                        g['STARTED_AT'], #[8]
                        '' #[9]
                        )
                    dbmgr.query(q)                    
    # =================================================================================================
    # SALARY CLASS ONLY
    # =================================================================================================                    
        for div_child in div.find_all('div', id='Salary'):
            # RETURN THE SECTION HEADER (FACET TYPE) FOR EACH OF THE CHILD ELEMENTS
            for span in div_child.find_all('span'):
                facet_type = span.text.upper()
            for div_data in div_child.find_all('div', class_='refineitem'):
                facet = div_data.find_all('label')                
                # FACET DESC (type desc)
                facet_desc = re.search('>(.*)</label', str(facet[0])) 
                facet_desc = str(facet_desc.group(1)).upper()               
                # FACET COUNT
                facet_count = re.search('>(.*)</label', str(facet[1]))
                if facet_count.group(1):
                    facet_count = facet_count.group(1)
                else:
                    facet_count = '0'
                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================   
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'], #[0]
                    g['MSMT_DTE_ID'], #[1]
                    g['DATA_TYPE'], #[2]
                    g['CNTRY_CDE'], #[3]
                    g['SITE_CDE'], #[4]
                    facet_type, #[5]
                    facet_desc, #[6]
                    facet_count, #[7]
                    g['STARTED_AT'], #[8]
                    '' #[9]
                    )
                dbmgr.query(q) 
    # =================================================================================================
    # MARKETS CLASS ONLY
    # =================================================================================================  
        for div_child in div.find_all('div', id='Markets'):
            # return the section header (facet type) for each of the child elements
            for span in div_child.find_all('span'):
                facet_type = span.text.upper()
            for div_data in div_child.find_all('div', class_='refineitem'):
                facet = div_data.find_all('label')                
                # FACET DESC (type desc)
                facet_desc = re.search('>(.*)</label', str(facet[0])) 
                facet_desc = str(facet_desc.group(1)).upper() 
                facet_desc = facet_desc.replace('&AMP;', '&')              
                # FACET COUNT
                facet_count = re.search('>(.*)</label', str(facet[1]))
                if facet_count.group(1):
                    facet_count = facet_count.group(1)
                else:
                    facet_count = '0'                
                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================   
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'], #[0]
                    g['MSMT_DTE_ID'], #[1]
                    g['DATA_TYPE'], #[2]
                    g['CNTRY_CDE'], #[3]
                    g['SITE_CDE'], #[4]
                    facet_type, #[5]
                    facet_desc, #[6]
                    facet_count, #[7]
                    g['STARTED_AT'], #[8]
                    '' #[9]
                    )
                dbmgr.query(q)
    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.write(str(soup))  # the with block closes the file on exit
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes  
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================            
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'], #[0]
        finished_at, #[1]
        g['CNTRY_CDE'], #[2]
        g['MSMT_DTE_ID'] #[3]
        )
    dbmgr.query(q)
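The facet parsing above runs a regex over str(tag) to pull label text; BeautifulSoup's get_text() returns the same thing directly. An illustrative fragment (the HTML here is invented for the example):

from bs4 import BeautifulSoup

fragment = '<div class="refineitem"><label>Full Time</label><label>1,234</label></div>'
labels = BeautifulSoup(fragment, 'html.parser').find_all('label')
facet_desc = labels[0].get_text(strip=True).upper()   # 'FULL TIME'
facet_count = labels[1].get_text(strip=True) or '0'   # '1,234'
print(facet_desc, facet_count)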
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(str(soup).encode('ascii', 'ignore'))
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================

    # PASS 0 -  TOTAL COUNT =======================================================================
    facet_type = 'TOTAL JOBS'

    for div in soup.find_all('div', class_='counter'):
        i = 0
        for span in div.find_all('span'):

            if i == 0:
                facet_desc = 'ALL JOBS'
            elif i == 1:
                facet_desc = 'ALL COMPANIES'
            else:
                facet_desc = 'NOT CATEGORISED'

            spanval = div.findAll('span')[i]

            txt1 = spanval.text.replace(',', '')
            txt2 = re.findall(r'\d+', txt1)
            facet_count = txt2[0]

            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],  #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],  #[2]
                g['CNTRY_CDE'],  #[3]
                g['SITE_CDE'],  #[4]
                facet_type,  #[5]
                facet_desc,  #[6]
                facet_count,  #[7]
                g['STARTED_AT'],  #[8]
                ''  #[9]
            )
            dbmgr.query(q)

            i = i + 1

    # PASS 1 -  REGION COUNT ======================================================================
    for div in soup.find_all('div',
                             id='locationTabContent'):  # LOCATION/REGION
        facet_type = 'REGION'

        for li in div.find_all('li'):
            for a in li.find_all('a', class_='region', href=True):

                facet_desc = a.text.upper().replace('JOBS IN', '').strip()

                for span in li.find_all('span'):
                    txt1 = span.text.replace(',', '')
                    txt2 = re.findall(r'\d+', txt1)
                    facet_count = txt2[0]

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],  #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],  #[2]
                    g['CNTRY_CDE'],  #[3]
                    g['SITE_CDE'],  #[4]
                    facet_type,  #[5]
                    facet_desc,  #[6]
                    facet_count,  #[7]
                    g['STARTED_AT'],  #[8]
                    ''  #[9]
                )
                dbmgr.query(q)

    # PASS 2 -  INDUSTRY COUNT ====================================================================
    for div in soup.find_all('div', id='sectorTabContent'):  # LOCATION/REGION
        facet_type = 'INDUSTRY'

        for li in div.find_all('li'):
            for a in li.find_all('a', href=True):

                facet_desc = a.text.upper().strip()

                for span in li.find_all('span'):
                    txt1 = span.text.replace(',', '')
                    txt2 = re.findall(r'\d+', txt1)
                    facet_count = txt2[0]

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],  #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],  #[2]
                    g['CNTRY_CDE'],  #[3]
                    g['SITE_CDE'],  #[4]
                    facet_type,  #[5]
                    facet_desc,  #[6]
                    facet_count,  #[7]
                    g['STARTED_AT'],  #[8]
                    ''  #[9]
                )
                dbmgr.query(q)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g[
            'CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(
                ' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+',
                  encoding='utf-8') as f:
            f.write(str(soup))  # the with block closes the file on exit
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
Example #12
    def htmlPass(url, **g):
        try:
            # ================================================================
            # EXTRACT HTML USING PARSER OR WEB DRIVER CONFIG
            # ================================================================
            if g['USES_WEB_DRVR'] == 'N':
                #requests.get('https://api.github.com/user', auth=('user', 'pass'))
                #headers = {'User-agent': 'Google Chrome'}
                #html = requests.get(url, headers=headers)
                html = Request(url)
                html.add_header('User-agent', 'Google Chrome')
                html = urlopen(html).read()
            elif g['USES_WEB_DRVR'] == 'Y':
                driver = webdriver.Chrome(
                    executable_path=str(g['DRVR_PATH'] + '\\' +
                                        g['WEB_DRVR_NME']))  #chromeDrvr
                driver.get(url)

                # SLEEP REQUIRED DUE TO SEEK TRYING TO REDIRECT PAGE AND MESSING WITH THE CAPTURE OF LINK
                # FORCES A WAIT FOR PAGE TO PROPERLY RENDER BEFORE CAPTURING HTML
                if 'SEEK' in url.upper():
                    time.sleep(
                        10
                    )  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

                html = driver.page_source

                driver.close()
                driver.quit()
                driver.stop_client()

            return html
        except:
            # capture a finish time to be entered into the db
            finished_at = time.strftime("%Y-%m-%d %H:%M:%S")
            e = sys.exc_info()
            print('ERROR ENCOUNTERED : ' + str(e))
            # =============================================================================
            # WRITE RESULTS OF ERROR TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            dbmgr.write_log(finished_at, 'HTML PASSING ERROR: ' + str(e), **g)
            # =============================================================================
            # EMAIL SUMMARY OF ERROR TO DISTRIBUTION LIST
            # =============================================================================
            htmlRes = '''<table cellpadding="8" cellspacing="3" border="3">
                        <tr>
                        <th>msmt_date_id</th>
                        <th>pkg_nme</th>
                        <th>start_datetime</th>
                        <th>end_datetime</th>
                        <th>status</th>
                        </tr>'''
            htmlRes = htmlRes + '<tr><td>' + str(
                g['MSMT_DTE_ID']) + '</td><td>' + str(
                    g['PKG_NME']) + '</td><td>' + str(
                        g['STARTED_AT']) + '</td><td>' + str(
                            finished_at) + '</td><td>' + 'ERROR' + '</td></tr>'
            htmlRes = htmlRes + '</table>'
            # SUBJECT & RECIPIENTS
            mymail = pyMail(
                str(g['PKG_NME']) + ' - ERROR ENCOUNTERED @ ' +
                time.strftime("%Y-%m-%d %H:%M:%S"), **g)
            # START HTML BODY (GREETING / OPENING LINE OF EMAIL).
            mymail.htmladd(
                'A HTML PASSING ERROR was encountered for package : ' +
                str(g['PKG_NME']))
            # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH SO LINE FEEDS NOT REQUIRED)
            # ADD LINE OF TEXT
            mymail.htmladd('Summary of ERROR')
            # ADD HTML TABLE CONSTRUCTED ABOVE
            mymail.htmladd(htmlRes)
            # HEADER FOR ERROR TEXT
            mymail.htmladd('<b><u>ERROR DETAIL</u></b>')
            # ADD FULL ERROR TO BODY OF EMAIL
            mymail.htmladd(str(e).replace('<', '(').replace('>', ')'))
            # SEND
            mymail.send()
            # QUIT EXECUTION OF PYTHON SCRIPT
            quit()
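The commented-out lines in htmlPass hint at the requests library as an alternative to urllib's Request/urlopen; that branch would look roughly like this (the User-Agent value is illustrative):

import requests

def fetch_html(url):
    # Equivalent of the urllib branch above using requests; raises for HTTP error statuses.
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; scraper)'}
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    return resp.text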
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - INDUSTRY COUNT =====================================================================
    facet_type = 'INDUSTRY'
    industry = g['INDUSTRY']
    industries_array = []
    # CONVERTS LIST OBJECT TO ARRAY FOR LOOPING
    for item in industry.split(','):  # COMMA, OR OTHER
        industries_array.append(item)
    # LOOP THROUGH ALL THE ITEMS IN INDUSTRIES
    for industry in industries_array:
        facet_desc = industry.upper().replace('-JOBS', '')
        facet_desc = facet_desc.replace(r'/', '')

        for i in range(10):
            print("iteration {0} ({1}) starting".format(i, facet_desc))
            while True:
                try:
                    time.sleep(
                        rndm_sleep
                    )  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE
                    # =============================================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================================
                    url = g['URL'] + r'/{}'.format(industry)
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    #print(soup)
                    #facet_count = re.search(r'JOBS 1 TO 10 OF(.*?)</DIV>', str(soup).encode("utf-8","ignore").decode('ascii', 'ignore').upper()).group(1)
                    facet_count = re.search(
                        r'PAGE 1 OF(.*?)JOBS</DIV>',
                        str(soup).encode("utf-8", "ignore").decode(
                            'ascii', 'ignore').upper()).group(1)
                    facet_count = int(facet_count.replace(',', ''))
                except:
                    e = sys.exc_info()
                    print("iteration {0} ({1}) failed with error : {2}".format(
                        i, facet_desc, e))
                    continue
                break
            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],  #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],  #[2]
                g['CNTRY_CDE'],  #[3]
                g['SITE_CDE'],  #[4]
                facet_type,  #[5]
                facet_desc,  #[6]
                facet_count,  #[7]
                g['STARTED_AT'],  #[8]
                ''  #[9]
            )
            dbmgr.query(q)

            break
        else:
            pass

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[
            'SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+',
                  encoding='utf-8') as f:
            f.write(str(soup))  # the with block closes the file on exit
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
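The `for i in range(10)` / `while True` pattern above never gives up: the inner loop swallows every exception and retries forever if the page keeps failing. A bounded retry helper is one way to cap the attempts (the attempt count and delay are arbitrary):

import time

def retry(fn, attempts=10, delay=5):
    # Calls fn() up to `attempts` times, sleeping between failures; re-raises the final error.
    for i in range(attempts):
        try:
            return fn()
        except Exception as e:
            print("iteration {0} failed with error : {1}".format(i, e))
            if i == attempts - 1:
                raise
            time.sleep(delay)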
Example #14
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # COLLECT ALL JOBS RELATED LINKS
    catLinks = []

    for links in soup.find_all('a'):
        full_ref = str(links)
        link_txt = str(links.get('href'))
        if '/JOBS-IN-' in full_ref.upper() and not ('PRIORITY'
                                                    in full_ref.upper()):
            catLinks.append(link_txt)
    #print(catLinks)

    # PASS 1 - INDUSTRY COUNT =====================================================================
    for link in catLinks:
        facet_type = 'INDUSTRY'

        for i in range(10):
            print("iteration {0} ({1}) starting".format(i, link))
            while True:
                try:
                    time.sleep(
                        rndm_sleep
                    )  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE
                    # =============================================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================================
                    url = g['URL'] + link  #.replace(href_search_str, '')
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    #print(soup)

                    title_txt = soup.title.string.upper()
                    idx = title_txt.find(' JOBS')
                    facet_desc = title_txt[:idx]
                    #print(facet_desc)

                    for span in soup.find_all('span', id='SearchSummary'):
                        for h1 in span.find_all('h1'):
                            nbr = re.search('COUNT">(.*?)</STRONG>',
                                            str(soup).upper()).group(0)
                            nbr = str(nbr).replace(',', '')
                            nbr = re.findall(r'\d+', nbr)
                            facet_count = nbr[0]
                            #print(facet_count)
                except:
                    e = sys.exc_info()
                    print("iteration {0} ({1}) failed with error : {2}".format(
                        i, facet_desc, e))
                    continue
                break

            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],  #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],  #[2]
                g['CNTRY_CDE'],  #[3]
                g['SITE_CDE'],  #[4]
                facet_type,  #[5]
                facet_desc,  #[6]
                facet_count,  #[7]
                g['STARTED_AT'],  #[8]
                ''  #[9]
            )
            dbmgr.query(q)

            break
        else:
            pass

        # =============================================================================
        # WRITE HTML PAGE TO FILE
        # =============================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = g['MSMT_DTE_ID'] + '_' + g[
                'CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(
                    ' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                      'w+',
                      encoding='utf-8') as f:
                f.write(str(soup))  # the with block closes the file on exit

    # PASS 2 - TOTAL COUNT ========================================================================
    time.sleep(
        rndm_sleep
    )  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'

    nbr = re.search('COUNT">(.*?)</STRONG>', str(soup).upper()).group(0)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]
    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        g['DATA_TYPE'],  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE'],  #[4]
        facet_type,  #[5]
        facet_desc,  #[6]
        facet_count,  #[7]
        g['STARTED_AT'],  #[8]
        ''  #[9]
    )
    dbmgr.query(q)
    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g[
            'CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(
                ' ', '_').replace('/', '-') + '_' + facet_desc.replace(
                    ' ', '_').replace('/', '-') + '.html'
    with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
              'w+',
              encoding='utf-8') as f:
        f.write(str(soup))  # the with block closes the file on exit

    # PASS 3 - REGION COUNT =======================================================================
    facet_type = 'REGION'
    regions = g['REGIONS']
    regions_array = []
    # CONVERTS LIST OBJECT TO ARRAY FOR LOOPING
    for item in regions.split(','):  # COMMA, OR OTHER
        regions_array.append(item)
    # LOOP THROUGH ALL THE ITEMS IN REGIONS
    for region in regions_array:
        for i in range(10):
            print("iteration {0} ({1}) starting".format(i, region))
            while True:
                try:
                    time.sleep(
                        rndm_sleep
                    )  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE
                    # =============================================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================================
                    url = g['URL'] + g['URL_PART1'] + g[
                        'URL_PART2'] + '{}'.format(region.replace(' ', '-'))
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    #print(soup)
                    facet_desc = str(region.upper())

                    nbr = re.search('COUNT">(.*?)</STRONG>',
                                    str(soup).upper()).group(0)
                    nbr = str(nbr).replace(',', '')
                    nbr = re.findall(r'\d+', nbr)
                    facet_count = nbr[0]

                except:
                    e = sys.exc_info()
                    print("iteration {0} ({1}) failed with error : {2}".format(
                        i, facet_desc, e))
                    continue
                break
            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],  #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],  #[2]
                g['CNTRY_CDE'],  #[3]
                g['SITE_CDE'],  #[4]
                facet_type,  #[5]
                facet_desc,  #[6]
                facet_count,  #[7]
                g['STARTED_AT'],  #[8]
                ''  #[9]
            )
            dbmgr.query(q)

            break
        else:
            pass

        # =============================================================================
        # WRITE HTML PAGE TO FILE
        # =============================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = g['MSMT_DTE_ID'] + '_' + g[
                'CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(
                    ' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                      'w+',
                      encoding='utf-8') as f:
                f.writelines(str(soup))
            f.close()
    else:
        pass
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
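
# =============================================================================
# ILLUSTRATIVE SKETCH (NOT PART OF THE ORIGINAL EXAMPLE)
# - the retry in the region loop above ("while True" + bare "except: continue")
#   never gives up if the page keeps failing; a bounded retry like the helper
#   below caps the number of attempts. fetch / attempts / pause_secs are
#   placeholder names, not values from the original script.
# =============================================================================
import time

def fetch_with_retry(fetch, attempts=3, pause_secs=5):
    """Call fetch() up to `attempts` times, pausing between failed attempts."""
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            return fetch()
        except Exception as e:  # mirrors the broad catch used in the loop above
            last_error = e
            print("attempt {0} failed with error : {1}".format(attempt, e))
            time.sleep(pause_secs)
    raise last_error

# possible usage (placeholder url / parser, assuming the same helpers as above):
# soup = fetch_with_retry(lambda: BeautifulSoup(pyHTMLPass.htmlPass(url, **g), "html.parser"))
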
Example #15
0
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - CATEGORY COUNT =====================================================================
    for div in soup.find_all('div', id="category"):
        for div_sub in div.find_all('div'):
            txt = re.findall(r'SECTORS">(.+?)</A>',
                             str(div_sub).upper())  # find any within brackets
            txt = str(txt[0])
            facet_type = txt.upper()
            #print(facet_type)
        for li in div.find_all('li'):
            txt = re.findall(r'JOBS">(.+?)</A>',
                             str(li).upper())  # find any within brackets
            txt = str(txt[0])
            facet_desc = txt.upper().replace('&AMP;', '&')

            find_nbr = re.findall(r'\([0-9]*\)',
                                  str(li))  # find any within brackets

            if find_nbr:
                find_nbr = str(find_nbr[0])
                facet_count = find_nbr.replace('(', '').replace(')', '')
                facet_count = int(facet_count)
                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],  #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],  #[2]
                    g['CNTRY_CDE'],  #[3]
                    g['SITE_CDE'],  #[4]
                    facet_type,  #[5]
                    facet_desc,  #[6]
                    facet_count,  #[7]
                    g['STARTED_AT'],  #[8]
                    ''  #[9]
                )
                dbmgr.query(q)
    # PASS 2 - REGION COUNT =======================================================================
    for div in soup.find_all('div', id="location"):
        for div_sub in div.find_all('div'):
            txt = re.findall(r'LOCATIONS">(.+?)</A>',
                             str(div_sub).upper())  # find any within brackets
            txt = str(txt[0])
            facet_type = txt.upper()
            #print(facet_type)
        for li in div.find_all('li'):
            txt = re.findall(r'">(.+?)</A>',
                             str(li).upper())  # find any within brackets
            txt = str(txt[0])
            facet_desc = txt.upper().replace('&AMP;', '&')

            find_nbr = re.findall(r'\([0-9]*\)',
                                  str(li))  # find any within brackets
            if find_nbr:
                find_nbr = str(find_nbr[0])
                facet_count = find_nbr.replace('(', '').replace(')', '')
                facet_count = int(facet_count)
                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],  #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],  #[2]
                    g['CNTRY_CDE'],  #[3]
                    g['SITE_CDE'],  #[4]
                    facet_type,  #[5]
                    facet_desc,  #[6]
                    facet_count,  #[7]
                    g['STARTED_AT'],  #[8]
                    ''  #[9]
                )
                dbmgr.query(q)
    # PASS 3 - JOB TYPE ===========================================================================
    for div in soup.find_all('div', id="subcategory"):
        for div_sub in div.find_all('div'):
            txt = re.findall(r'TYPES">(.+?)</A>',
                             str(div_sub).upper())  # find any within brackets
            txt = str(txt[0])
            facet_type = txt.upper()
            #print(facet_type)
        for li in div.find_all('li'):
            txt = re.findall(r'JOBS">(.+?)</A>',
                             str(li).upper())  # find any within brackets
            txt = str(txt[0])
            facet_desc = txt.upper().replace('&AMP;', '&')

            find_nbr = re.findall(r'\([0-9]*\)',
                                  str(li))  # find any within brackets

            if find_nbr:
                find_nbr = str(find_nbr[0])
                facet_count = find_nbr.replace('(', '').replace(')', '')
                facet_count = int(facet_count)
                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],  #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],  #[2]
                    g['CNTRY_CDE'],  #[3]
                    g['SITE_CDE'],  #[4]
                    facet_type,  #[5]
                    facet_desc,  #[6]
                    facet_count,  #[7]
                    g['STARTED_AT'],  #[8]
                    ''  #[9]
                )
                dbmgr.query(q)
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
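
# =============================================================================
# ILLUSTRATIVE SKETCH (NOT PART OF THE ORIGINAL EXAMPLE)
# - the three passes above each pull a "(123)" style count out of an <li> and
#   then strip the parentheses by hand; a small helper keeps that parsing in
#   one place. The regex simply restates the pattern already used above.
# =============================================================================
import re

def parse_facet_count(text):
    """Return the first '(NNN)' count in text as an int, or None if none is present."""
    match = re.search(r'\((\d+)\)', str(text).replace(',', ''))
    return int(match.group(1)) if match else None

# examples:
# parse_facet_count('Accounting Jobs (1,234)')  -> 1234
# parse_facet_count('Accounting Jobs')          -> None
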
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #soup = soup.encode("utf-8","ignore").decode('ascii', 'ignore')
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # COLLECT ALL JOBS RELATED LINKS
    regionLinksList = []
    industryLinksList = []
    jobtypeLinksList = []

    # PASS 1 - COLLECT LINKS FOR THE VARIOUS TYPES ====================================
    for ul in soup.find_all('ul', class_='provinceList'):
        for links in ul.find_all('a'):
            link = str(links.get('href'))
            regionLinksList.append(link)
            #print(regionLinksList)

    for ul in soup.find_all('ul', class_='categoryList'):
        for links in ul.find_all('a'):
            link = str(links.get('href'))
            industryLinksList.append(link)
            #print(industryLinksList)

    #for ul in soup.find_all('ul', class_='studentsList'):
    #    for links in ul.find_all('a'):
    #        link = str(links.get('href'))
    #        jobtypeLinksList.append(link)

    # PASS 2 - COLLECT REGION DATA ====================================================
    facet_type = 'REGION'
    for link in regionLinksList:
        time.sleep(
            rndm_sleep
        )  # INSERTS A PAUSE SO LOOPED REQUESTS BETTER MIMIC HUMAN INTERACTION WITH THE WEBPAGE
        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link)

        #url = link #.replace(href_search_str, '')
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        for h1 in soup.find_all('h1'):
            for links in h1.find_all('a'):
                facet_desc = links.text.upper().replace('JOBS', '').strip()
                link = str(links.get('href'))
                # =============================================================================
                # PASS URL TO RETURN HTML FROM SITE PAGE
                # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                # =============================================================================
                url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link)
                #url = link #.replace(href_search_str, '')
                passedHTML = pyHTMLPass.htmlPass(url, **g)
                soup = BeautifulSoup(passedHTML, "html.parser")

                for div in soup.find_all('div',
                                         class_='ResultText'):  #result-count
                    for span in div.find_all('span',
                                             class_='ResultText-numTotal'):
                        facet_count = span.text  #re.search(r' of(.*?)</strong>',str(strong)).group(1)

                        #facet_count = strong.text.upper()
                        #facet_count = facet_count.split('OF',1)[1]
                        facet_count = facet_count.strip()

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],  #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],  #[2]
                    g['CNTRY_CDE'],  #[3]
                    g['SITE_CDE'],  #[4]
                    facet_type,  #[5]
                    facet_desc,  #[6]
                    facet_count,  #[7]
                    g['STARTED_AT'],  #[8]
                    ''  #[9]
                )
                dbmgr.query(q)
                # =============================================================================
                # WRITE HTML PAGE TO FILE
                # =============================================================================
                if g['WRITE_HTML_TO_FILE'] == 'Y':
                    file_name = g['MSMT_DTE_ID'] + '_' + g[
                        'CNTRY_CDE'] + '_' + g[
                            'SITE_CDE'] + '_' + facet_type.replace(
                                ' ', '_') + '_' + facet_desc.replace(
                                    ' ', '_') + '.html'
                    with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                              'w+',
                              encoding='utf-8') as f:
                        f.writelines(str(soup))
                    f.close()

    # PASS 3 - COLLECT INDUSTRY DATA ==================================================
    facet_type = 'INDUSTRY'
    for link in industryLinksList:
        time.sleep(
            rndm_sleep
        )  # INSERTS A PAUSE SO LOOPED REQUESTS BETTER MIMIC HUMAN INTERACTION WITH THE WEBPAGE
        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link)
        #url = link #.replace(href_search_str, '')
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        for h1 in soup.find_all('h1'):
            for links in h1.find_all('a'):
                facet_desc = links.text.upper().replace('JOBS', '').strip()
                link = str(links.get('href'))
                # =============================================================================
                # PASS URL TO RETURN HTML FROM SITE PAGE
                # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                # =============================================================================
                url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link)
                #url = link #.replace(href_search_str, '')
                passedHTML = pyHTMLPass.htmlPass(url, **g)
                soup = BeautifulSoup(passedHTML, "html.parser")

                for div in soup.find_all('div', class_='ResultText'):
                    for span in div.find_all('span',
                                             class_='ResultText-numTotal'):
                        facet_count = span.text  #re.search(r' of(.*?)</strong>',str(strong)).group(1)
                        facet_count = facet_count.strip()

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],  #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],  #[2]
                    g['CNTRY_CDE'],  #[3]
                    g['SITE_CDE'],  #[4]
                    facet_type,  #[5]
                    facet_desc,  #[6]
                    facet_count,  #[7]
                    g['STARTED_AT'],  #[8]
                    ''  #[9]
                )
                dbmgr.query(q)
                # =============================================================================
                # WRITE HTML PAGE TO FILE
                # =============================================================================
                if g['WRITE_HTML_TO_FILE'] == 'Y':
                    file_name = g['MSMT_DTE_ID'] + '_' + g[
                        'CNTRY_CDE'] + '_' + g[
                            'SITE_CDE'] + '_' + facet_type.replace(
                                ' ', '_') + '_' + facet_desc.replace(
                                    ' ', '_') + '.html'
                    with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                              'w+',
                              encoding='utf-8') as f:
                        f.writelines(str(soup))
                    f.close()

    # PASS 4 - COLLECT JOBTYPE DATA ===================================================
    facet_type = 'JOB TYPE'
    for link in jobtypeLinksList:
        time.sleep(
            rndm_sleep
        )  # INSERTS A PAUSE SO LOOPED REQUESTS BETTER MIMIC HUMAN INTERACTION WITH THE WEBPAGE
        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link)
        #url = link #.replace(href_search_str, '')
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        for h1 in soup.find_all('h1', class_='sr-search-title'):
            facet_desc = h1.text.upper().replace('JOBS', '').strip()
            #print(facet_desc)

            for div in soup.find_all('div', class_='result-count'):
                for p in div.find_all('p'):
                    facet_count = p.text.upper()
                    facet_count = facet_count.split('OF', 1)[1]
                    facet_count = facet_count.strip()

            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],  #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],  #[2]
                g['CNTRY_CDE'],  #[3]
                g['SITE_CDE'],  #[4]
                facet_type,  #[5]
                facet_desc,  #[6]
                facet_count,  #[7]
                g['STARTED_AT'],  #[8]
                ''  #[9]
            )
            dbmgr.query(q)
            # =============================================================================
            # WRITE HTML PAGE TO FILE
            # =============================================================================
            if g['WRITE_HTML_TO_FILE'] == 'Y':
                file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[
                    'SITE_CDE'] + '_' + facet_type.replace(
                        ' ', '_') + '_' + facet_desc.replace(' ',
                                                             '_') + '.html'
                with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                          'w+',
                          encoding='utf-8') as f:
                    f.writelines(str(soup))
                f.close()

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
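
# =============================================================================
# ILLUSTRATIVE SKETCH (NOT PART OF THE ORIGINAL EXAMPLE)
# - the WRITE HTML PAGE TO FILE blocks above repeat the same file-name
#   construction with chained replace() calls; a helper like this keeps the
#   sanitising in one place. The argument names simply mirror the g[...] values
#   used above, and the example values are made up.
# =============================================================================
def build_html_file_name(msmt_dte_id, cntry_cde, site_cde, facet_type, facet_desc):
    """Build the <date>_<country>_<site>_<type>_<desc>.html name used when saving pages."""
    def clean(part):
        # same character handling as the inline version: spaces -> '_', '/' -> '-'
        return str(part).replace(' ', '_').replace('/', '-')
    return '_'.join([str(msmt_dte_id), str(cntry_cde), str(site_cde),
                     clean(facet_type), clean(facet_desc)]) + '.html'

# example:
# build_html_file_name('20240101', 'AU', 'SITE1', 'JOB TYPE', 'PART TIME')
#   -> '20240101_AU_SITE1_JOB_TYPE_PART_TIME.html'
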
Example #17
0
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS
    #soup = str(soup)
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - TOTAL COUNT ========================================================================
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'
    nbr = re.search('<title>(.*?)</title>', str(soup.encode("utf-8"))).group(1)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]
    facet_count = int(facet_count)
    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        g['DATA_TYPE'],  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE'],  #[4]
        facet_type,  #[5]
        facet_desc,  #[6]
        facet_count,  #[7]
        g['STARTED_AT'],  #[8]
        ''  #[9]
    )
    dbmgr.query(q)
    # PASS 2 - INDUSTRY COUNT =====================================================================
    for ul in soup.find_all('ul', class_='facet'):
        for li in ul.find_all('li'):
            # RETURN THE FACET TEXT (SECTION TITLE)
            facet = li.find(
                'strong'
            )  # ASSUMES THE FIRST ROW OF THE FACET IS THE "TITLE" ROW - BREAKS IF IT ISN'T
            if facet:
                facet_type = facet.text.upper()
            else:
                facet_type = facet_type.upper(
                )  # IF NONE IS FOUND, APPLY CURRENT FACET_TYPE VALUE TO NEXT FACET_TYPE VALUE

            facet_desc = li.find('a')

            if facet_desc:  # CHECKS IF THERE IS A RESULT ON THE SEARCH FOR THE "A" ANCHOR (REMOVES THE TITLE OF THE SECTIONS BY DEFAULT - RETURNED ABOVE)
                facet_desc = facet_desc.text.upper()
                facet_desc = re.sub(
                    r"[!@#$']", '',
                    str(facet_desc))  # REMOVES SPECIAL CHARACTERS FROM STRING
                facet_count = li.find('span')
                facet_count = int(facet_count.text.replace(',', ''))
                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],  #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],  #[2]
                    g['CNTRY_CDE'],  #[3]
                    g['SITE_CDE'],  #[4]
                    facet_type,  #[5]
                    facet_desc,  #[6]
                    facet_count,  #[7]
                    g['STARTED_AT'],  #[8]
                    ''  #[9]
                )
                dbmgr.query(q)
            else:  # IF NO "A" ANCHOR IS FOUND, IGNORE
                pass
    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g[
            'CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(
                ' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+',
                  encoding='utf-8') as f:
            f.writelines(str(soup))
        f.close()
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
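
# =============================================================================
# ILLUSTRATIVE SKETCH (NOT PART OF THE ORIGINAL EXAMPLE)
# - the PASS 2 loop above treats the <strong> row of each ul.facet as the facet
#   title and every <a>/<span> pair as a description/count. The HTML fragment
#   below is made up purely to show that traversal in isolation.
# =============================================================================
from bs4 import BeautifulSoup

sample_html = """
<ul class="facet">
  <li><strong>Industry</strong></li>
  <li><a href="#">Accounting</a> <span>1,234</span></li>
  <li><a href="#">Engineering</a> <span>567</span></li>
</ul>
"""
sample_soup = BeautifulSoup(sample_html, "html.parser")
for ul in sample_soup.find_all('ul', class_='facet'):
    facet_title = None
    for li in ul.find_all('li'):
        title = li.find('strong')
        if title:  # first row carries the facet title
            facet_title = title.text.upper()
            continue
        desc = li.find('a').text.upper()
        count = int(li.find('span').text.replace(',', ''))
        print(facet_title, desc, count)  # e.g. INDUSTRY ACCOUNTING 1234
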
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (captr_dte_id = {1} or captr_dte_id <= {2})""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
    )
    dbmgr.query(q)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - COMMODITY DATA =====================================================================
    quandl.ApiConfig.api_key = g['QUANDL_API_KEY']

    #     quandl.get("NASDAQOMX/NQCIGCTR", authtoken="-7YMD_XEY7yvNsYDX92s") # GOLD
    #     quandl.get("NASDAQOMX/NQCISITR", authtoken="-7YMD_XEY7yvNsYDX92s") # SILVER
    #     quandl.get("NASDAQOMX/NQCICUTR", authtoken="-7YMD_XEY7yvNsYDX92s") # COPPER
    #     quandl.get("NASDAQOMX/NQCICLTR", authtoken="-7YMD_XEY7yvNsYDX92s") # CRUDE OIL LIGHT
    #     quandl.get("NASDAQOMX/NQCICBTR", authtoken="-7YMD_XEY7yvNsYDX92s") # CRUDE OIL BRENT
    #     quandl.get("NASDAQOMX/NQCINGTR", authtoken="-7YMD_XEY7yvNsYDX92s") # NATURAL GAS
    #     quandl.get("NASDAQOMX/NQCIKCTR", authtoken="-7YMD_XEY7yvNsYDX92s") # COFFEE
    #     quandl.get("NASDAQOMX/NQCICCTR", authtoken="-7YMD_XEY7yvNsYDX92s") # COCOA

    for key, value in g['CMDTY'].items():
        try:
            data_type = key
            # RETURN DATAFRAME
            dat = quandl.get(value, authtoken=g['QUANDL_API_KEY'], rows=5)

            for index, row in dat.iterrows():
                #print( index, row[0], row[1], row[2], row[3], row[4])
                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, TRADE_DT, INDEX_VAL, HIGH_VAL, LOW_VAL, TTL_MRKT_VAL, DIV_MRKT_VAL, CAPTR_DTE_ID, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', {9}, '{10}', '{11}')""".format(
                    g['TBL_NME'],  #[0]
                    index.strftime('%Y%m%d'),  #[1]  time.strftime('%Y%m%d')
                    data_type,  #[2]
                    index.strftime('%Y-%m-%d'),  #[3]
                    row[0],  #[4]
                    row[1],  #[5]
                    row[2],  #[6]
                    row[3],  #[7]
                    row[4],  #[8]
                    g['MSMT_DTE_ID'],  #[9]
                    g['STARTED_AT'],  #[10]
                    ''  #[11]
                )
                dbmgr.query(q)
        except:
            # capture a finish time to be entered into the db
            finished_at = time.strftime("%Y-%m-%d %H:%M:%S")
            # =============================================================================
            # WRITE RESULTS OF ERROR TO LOCAL DB
            # =============================================================================
            e = sys.exc_info()
            dbmgr = pyDB(g['DB'])
            dbmgr.write_log(finished_at, 'QUANDL API ERROR: ' + str(e), **g)

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE captr_dte_id = {2}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['MSMT_DTE_ID']  #[2]
    )
    dbmgr.query(q)
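
# =============================================================================
# ILLUSTRATIVE SKETCH (NOT PART OF THE ORIGINAL EXAMPLE)
# - quandl.get returns a pandas DataFrame indexed by date, and the loop above
#   walks it with iterrows(). The locally built frame below shows the same
#   pattern without needing an API key; the column names are placeholders, not
#   the real NASDAQOMX columns.
# =============================================================================
import pandas as pd

sample_dat = pd.DataFrame(
    {'INDEX_VAL': [100.0, 101.5], 'HIGH_VAL': [101.0, 102.0]},
    index=pd.to_datetime(['2024-01-02', '2024-01-03']))

for index, row in sample_dat.iterrows():
    # index is a Timestamp, so strftime works exactly as in the INSERT above
    print(index.strftime('%Y%m%d'), row['INDEX_VAL'], row['HIGH_VAL'])
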
Example #19
0
    def tweepySearch(searchQuery, sinceId, sentmnt_mtch, place_id, cc, **g):
        # SETUP TWITTER AUTHORISATION
        auth = tweepy.AppAuthHandler(g['TWIT_CNSMR_KEY'],
                                     g['TWIT_CNSMR_SECRET'])
        api = tweepy.API(auth,
                         wait_on_rate_limit=True,
                         wait_on_rate_limit_notify=True)

        if (not api):
            print('UNABLE TO AUTHENTICATE')
            sys.exit(-1)

        # IF ONLY RESULTS BELOW A SPECIFIC ID ARE REQUIRED, SET MAX_ID TO THAT ID;
        # ELSE DEFAULT TO NO UPPER LIMIT AND START FROM THE MOST RECENT TWEET MATCHING THE SEARCH QUERY.
        max_id = -1
        tweetCount = 0

        print("Downloading max {0} tweets".format(g['TWIT_MAX_TWEETS']))

        while tweetCount < int(g['TWIT_MAX_TWEETS']):
            try:
                if (max_id <= 0):
                    if (not sinceId):
                        tweets = api.search(
                            q=searchQuery, count=int(
                                g['TWIT_TWEETS_PER_QRY']))  #q=searchQuery
                    else:
                        tweets = api.search(q=searchQuery,
                                            count=int(
                                                g['TWIT_TWEETS_PER_QRY']),
                                            since_id=sinceId)
                else:
                    if (not sinceId):
                        tweets = api.search(q=searchQuery,
                                            count=int(
                                                g['TWIT_TWEETS_PER_QRY']),
                                            max_id=str(max_id - 1))
                    else:
                        tweets = api.search(q=searchQuery,
                                            count=int(
                                                g['TWIT_TWEETS_PER_QRY']),
                                            max_id=str(max_id - 1),
                                            since_id=sinceId)

                if not tweets:
                    print("No more tweets found")
                    break

                # =============================================================================
                # PROCESS TWEETS COLLECTED FROM THE SEARCH API PROCESS
                # =============================================================================
                for tweet in tweets:
                    # =============================================================================
                    # WRITE RESULTS OF EACH TWEET TO LOCAL DB
                    # =============================================================================
                    created_at = str(tweet.created_at).split(' ')
                    created_at = created_at[0].replace('-', '')
                    #print(tweet.encode('ascii', 'replace').decode("utf-8"))
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0} (MSMT_DTE_ID, CREATED_AT, TWEET_ID, USER_ID, USER_NAME, USER_SCREEN_NAME, USER_LOCATION, CNTRY_ID, CNTRY_CDE, PLACE_NAME, SENTMT_MATCH, TWEET_TXT, IN_REPLY_TO, RE_TWEETED, PRCES_DTE_ID, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', {3}, {4}, '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}', '{13}', '{14}', {15}, '{16}', '{17}')""".format(
                        g['TBL_NME'],  #[0]
                        created_at,  #[1]
                        str(tweet.created_at),  #[2]
                        tweet.id,  #[3]
                        tweet.user.id,  #[4]
                        str(
                            tweet.user.name.encode(
                                'ascii', 'replace').decode("utf-8")).replace(
                                    '?',
                                    '').replace("'",
                                                '').rstrip().lstrip(),  #[5]
                        str(
                            tweet.user.screen_name.encode(
                                'ascii', 'replace').decode("utf-8")).replace(
                                    '?',
                                    '').replace("'",
                                                '').rstrip().lstrip(),  #[6]
                        str(
                            tweet.user.location.encode(
                                'ascii', 'replace').decode("utf-8")).replace(
                                    '?', '').replace(
                                        "'",
                                        '').rstrip().lstrip().upper(),  #[7]
                        str(place_id),  #[8]
                        cc,  #[9]
                        str(
                            tweet.place.name.encode(
                                'ascii', 'replace').decode("utf-8")).replace(
                                    '?', '').replace(
                                        "'",
                                        '').rstrip().lstrip().upper(),  #[10]
                        sentmnt_mtch,  #[11]
                        str(
                            tweet.text.encode(
                                'ascii', 'replace').decode("utf-8")).replace(
                                    '?', '').replace("'", '').replace(
                                        '\n', '. ').replace('. . ',
                                                            '. '),  #[12]
                        ('NOVAL'
                         if tweet.in_reply_to_status_id_str is None else str(
                             tweet.in_reply_to_status_id_str).upper()),  #[13]
                        str(tweet.retweeted).upper(),  #[14]
                        g['MSMT_DTE_ID'],  #[15]
                        g['STARTED_AT'],  #[16]
                        ''  #[17]
                    )
                    #print(q)
                    dbmgr.query(q)

                tweetCount += len(tweets)
                print("Downloaded {0} tweets".format(tweetCount))
                max_id = tweets[-1].id

            except tweepy.TweepError as e:
                # capture a finish time to be entered into the db
                finished_at = time.strftime("%Y-%m-%d %H:%M:%S")
                e = sys.exc_info()
                print('ERROR ENCOUNTERED : ' + str(e))
                # =============================================================================
                # WRITE RESULTS OF ERROR TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                dbmgr.write_log(finished_at,
                                'TWITTER SEARCH ERROR : ' + str(e), **g)
                # =============================================================================
                # EMAIL SUMMARY OF ERROR TO TO DISTRIBUTION LIST
                # =============================================================================
                htmlRes = '''<table cellpadding="8" cellspacing="3" border="3">
                            <tr>
                            <th>msmt_date_id</th>
                            <th>pkg_nme</th>
                            <th>start_datetime</th>
                            <th>end_datetime</th>
                            <th>status</th>
                            </tr>'''
                htmlRes = htmlRes + '<tr><td>' + str(
                    g['MSMT_DTE_ID']
                ) + '</td><td>' + str(g['PKG_NME']) + '</td><td>' + str(
                    g['STARTED_AT']) + '</td><td>' + str(
                        finished_at) + '</td><td>' + 'ERROR' + '</td></tr>'
                htmlRes = htmlRes + '</table>'
                # SUBJECT & RECIPIENTS
                mymail = pyMail(
                    str(g['PKG_NME']) + ' - ERROR ENCOUNTERED @ ' +
                    time.strftime("%Y-%m-%d %H:%M:%S"), **g)
                # START HTML BODY (GREETING / OPENING LINE OF EMAIL).
                mymail.htmladd(
                    'A TWITTER SEARCH ERROR was encountered for package : ' +
                    str(g['PKG_NME']))
                # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH SO LINE FEEDS NOT REQUIRED)
                # ADD LINE OF TEXT
                mymail.htmladd('Summary of ERROR')
                # ADD HTML TABLE CONSTRUCTED ABOVE
                mymail.htmladd(htmlRes)
                # HEADER FOR ERROR TEXT
                mymail.htmladd('<b><u>ERROR DETAIL</u></b>')
                # ADD FULL ERROR TO BODY OF EMAIL
                mymail.htmladd(str(e).replace('<', '(').replace('>', ')'))
                # SEND
                mymail.send()
                # QUIT EXECUTION OF PYTHON SCRIPT

                # EXIT IF ANY ERROR
                print("some error : " + str(e))
                break

        print("Downloaded {0} tweets".format(tweetCount))
Example #20
0
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - INDUSTRY COUNT =====================================================================
    facet_type = 'INDUSTRY'
    for div in soup.find_all('div', class_="content-holder container"):
        for a in div.find_all('a'):
            for span in a.find_all('span'):
                if 'TITLE' in str(span).upper():
                    txt = span.text.upper().replace(r"'", '')
                elif 'COUNT' in str(span).upper():
                    nbr = re.findall(r'\d+', span.text)

            facet_desc = txt
            facet_count = int(str(nbr[0]))
            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],  #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],  #[2]
                g['CNTRY_CDE'],  #[3]
                g['SITE_CDE'],  #[4]
                facet_type,  #[5]
                facet_desc,  #[6]
                facet_count,  #[7]
                g['STARTED_AT'],  #[8]
                ''  #[9]
            )
            dbmgr.query(q)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[
            'SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+',
                  encoding='utf-8') as f:
            f.writelines(str(soup))
        f.close()
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
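
# =============================================================================
# ILLUSTRATIVE SKETCH (NOT PART OF THE ORIGINAL EXAMPLE)
# - in the INDUSTRY pass above, txt and nbr are only assigned when a matching
#   <span> is found, so an anchor with no "title"/"count" span would reuse
#   stale values (or raise NameError on the very first anchor). Resetting them
#   per anchor and skipping incomplete entries avoids that; the HTML fragment
#   below is made up for the demonstration.
# =============================================================================
import re
from bs4 import BeautifulSoup

demo_html = """
<div class="content-holder container">
  <a href="#"><span class="title">Mining</span> <span class="count">12 jobs</span></a>
  <a href="#"><span class="other">no facet data here</span></a>
</div>
"""
demo_soup = BeautifulSoup(demo_html, "html.parser")
for a in demo_soup.find_all('a'):
    txt, nbr = None, None
    for span in a.find_all('span'):
        if 'TITLE' in str(span).upper():
            txt = span.text.upper()
        elif 'COUNT' in str(span).upper():
            nbr = re.findall(r'\d+', span.text)
    if txt is None or not nbr:
        continue  # skip anchors without both a title and a count
    print(txt, int(nbr[0]))  # -> MINING 12
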
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - INDUSTRY & REGION COUNT ============================================================
    for div in soup.find_all('div', id='ajaxRefineSearch'):
        for ref in div.find_all('div', class_='refineItem'):

            refText = str(ref).upper()
            # FACET TYPE
            if 'LOCATION_STATE' in refText:
                facet_type = 'REGION'
            elif 'EMPLOYMENTTYPE' in refText:
                facet_type = 'JOB TYPE'
            elif 'COMPANYNAME' in refText:
                facet_type = 'COMPANY NAME'
            elif 'JOBCATEGORY' in refText:
                facet_type = 'INDUSTRY'
            elif 'LOCATION_COUNTRY' in refText:
                facet_type = 'LOCATION'
            elif 'SALARYTYPE' in refText:
                facet_type = 'SALARY ESTIMATE'

            # FACET DESCRIPTION
            for links in ref.find_all('a'):
                linkText = links.string.upper()
                facet_desc = linkText

            try:  # IGNORES ENTRIES THAT HAVE NO nbr VAL
                # NUMBER VALUE
                nbr = re.search(r'\((\d+(?:\.\d+)?)\)', refText).group(1)
                nbr = str(nbr).replace(',', '')
                facet_count = nbr
                #facet_count = re.findall('\d+', nbr)

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],  #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],  #[2]
                    g['CNTRY_CDE'],  #[3]
                    g['SITE_CDE'],  #[4]
                    facet_type,  #[5]
                    facet_desc,  #[6]
                    facet_count,  #[7]
                    g['STARTED_AT'],  #[8]
                    ''  #[9]
                )
                dbmgr.query(q)
                # =============================================================================
                # WRITE HTML PAGE TO FILE
                # =============================================================================
                if g['WRITE_HTML_TO_FILE'] == 'Y':
                    file_name = g['MSMT_DTE_ID'] + '_' + g[
                        'CNTRY_CDE'] + '_' + g[
                            'SITE_CDE'] + '_' + facet_type.replace(
                                ' ', '_') + '_' + facet_desc.replace(
                                    ' ', '_') + '.html'
                    with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                              'w+',
                              encoding='utf-8') as f:
                        f.writelines(str(soup))
            except:
                pass
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
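The DELETE, INSERT and UPDATE statements above splice values straight into the SQL text. Assuming pyDB wraps sqlite3 (the wrapper's own API is not shown here), the same statement can be issued with bound parameters so quoting is handled by the driver; the function name below is illustrative only.

import sqlite3

def delete_rerun_rows(db_path, tbl_nme, msmt_dte_id, retention_date_id, cntry_cde, site_cde):
    # The table name is trusted configuration, so it is still formatted in;
    # every value travels as a bound parameter instead of being quoted by hand.
    q = ("DELETE FROM {0} WHERE (msmt_dte_id = ? or msmt_dte_id <= ?) "
         "and cntry_cde = ? and site_cde = ?").format(tbl_nme)
    conn = sqlite3.connect(db_path)
    with conn:  # the context manager commits the transaction on success
        conn.execute(q, (msmt_dte_id, retention_date_id, cntry_cde, site_cde))
    conn.close()

The same placeholder pattern applies to the INSERT INTO ... VALUES statements used throughout the scrape functions.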
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow,rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'], #[0]
        g['MSMT_DTE_ID'], #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'], #[3]
        g['SITE_CDE'] #[4]
        ) 
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url,**g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes  
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - INDUSTRY DETAILS ===================================================================
    for links in soup.find_all('a'):        
        full_ref = str(links)
        link_txt = str(links.get('href'))        
        if 'JOBS AVAILABLE IN' in full_ref.upper():
            facet_type = 'INDUSTRY'
            facet_desc = links.string.upper()            
            link_nbr = re.findall(r'\d+', full_ref)
            facet_count = ''.join(str(e) for e in link_nbr)
            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================   
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'], #[0]
                g['MSMT_DTE_ID'], #[1]
                g['DATA_TYPE'], #[2]
                g['CNTRY_CDE'], #[3]
                g['SITE_CDE'], #[4]
                facet_type, #[5]
                facet_desc, #[6]
                facet_count, #[7]
                g['STARTED_AT'], #[8]
                '' #[9]
                )
            dbmgr.query(q)
    # PASS 2 - REGIONAL DETAILS ===================================================================  
    facet_type = 'REGION'
    regions = g['REGIONS']
    # CONVERTS COMMA-DELIMITED STRING TO LIST FOR LOOPING
    regions_array = regions.split(',')
    # LOOP THROUGH ALL THE ITEMS IN REGIONS
    for region in regions_array:    
        time.sleep(rndm_sleep) # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE
        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'] + g['URL_PART1'] + '{}'.format(region.replace(' ','+'))
        passedHTML = pyHTMLPass.htmlPass(url,**g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)   
        soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS   
        soup = str(soup)
        # ==========================================================================================================================================================
        # SCRAPE SUB PART - START
        # ==========================================================================================================================================================
        facet_desc = str(region.upper())
        facet_count = re.search('1-10 of(.*?)</p>', soup).group(1) 
        facet_count = facet_count.replace(',','').strip()
        # =============================================================================
        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
        # =============================================================================   
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'], #[0]
            g['MSMT_DTE_ID'], #[1]
            g['DATA_TYPE'], #[2]
            g['CNTRY_CDE'], #[3]
            g['SITE_CDE'], #[4]
            facet_type, #[5]
            facet_desc, #[6]
            facet_count, #[7]
            g['STARTED_AT'], #[8]
            '' #[9]
            )
        dbmgr.query(q)
    # ==========================================================================================================================================================
    # SCRAPE SUB PART - END
    # ==========================================================================================================================================================
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes  
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================            
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'], #[0]
        finished_at, #[1]
        g['CNTRY_CDE'], #[2]
        g['MSMT_DTE_ID'] #[3]
        )
    dbmgr.query(q)
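Each scrape() derives a YYYYMMDD cut-off from the DATA_RETENTION_DAYS variable before running its DELETE. A standalone sketch of that calculation (the helper name is illustrative, not part of the package):

import datetime

def compute_retention_date_id(retention_days):
    # Subtract the retention window from today and return it in the same
    # YYYYMMDD form that msmt_dte_id uses in the scrape tables.
    cutoff = datetime.date.today() - datetime.timedelta(days=int(retention_days))
    return cutoff.strftime('%Y%m%d')

# e.g. compute_retention_date_id(g['DATA_RETENTION_DAYS']) -> '20240101'-style text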
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  # [0]
        g['MSMT_DTE_ID'],  # [1]
        retention_date_id,  # [2]
        g['CNTRY_CDE'],  # [3]
        g['SITE_CDE']  # [4]
        ) 
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    # print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes  
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - TOTAL COUNT ========================================================================
    for div in soup.find_all('div', class_='inner cover'):        
        chk_str = str(div).upper()
        chk_str = chk_str.replace(',', '')        
        nbr = re.search(r'SEARCH.<B>(\d*)</B>', chk_str).group(1)

        facet_type = 'TOTAL'
        facet_desc = 'ALL JOBS'
        facet_count = int(nbr)
        # =============================================================================
        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
        # =============================================================================   
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],  # [0]
            g['MSMT_DTE_ID'],  # [1]
            g['DATA_TYPE'],  # [2]
            g['CNTRY_CDE'],  # [3]
            g['SITE_CDE'],  # [4]
            facet_type,  # [5]
            facet_desc,  # [6]
            facet_count,  # [7]
            g['STARTED_AT'],  # [8]
            ''  # [9]
            )
        dbmgr.query(q)
    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(' ','_') + '_' + facet_desc.replace(' ','_') + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,'w+', encoding='utf-8') as f:  
            f.writelines(str(soup)) 
    # PASS 2 - ALL OTHER FACETS ===================================================================
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1'] 
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    # print(soup)   
    
    for div in soup.find_all('div', class_="results-filter-content"):
        for section in div.find_all('section'):
            # FACET TYPE
            facet_type = section.find('h3')
            facet_type = facet_type.text.upper()
            facet_type = facet_type.replace('HIDE FILTERS', '').replace('DISPLAY FILTERS', '').replace('HELP - EDUCATION OR TRAINING', '').strip()
            if 'REGIONS' in facet_type:
                facet_type = 'REGIONS'
            elif 'CATEGORIES' in facet_type:
                facet_type = 'CATEGORY'
            
            # print(facet_type)
            # FACET DESCRIPTION AND COUNT
            for li in section.find_all('li'):
                txt = li.text
                txt = txt.replace('\\', '~').replace('\n', '~').replace('\r', '~').replace('\t', '~').upper()
                txt = txt.replace('~', '').replace("'", "").strip()
                # print(txt)
                
                # FACET DESCRIPTION ===========================================================
                facet_desc = re.findall(' FOUND(.*)', str(txt))
                facet_desc = cleanhtml(facet_desc[0])

                #facet_desc = str(facet_desc[0]).strip()
                # print(facet_desc)
                
                # FACET COUNT =================================================================
                facet_count = re.findall(r'(\d*)', txt)
                facet_count = str(facet_count[0])
                facet_count = facet_count.replace(',', '')
                try:
                    facet_count = int(facet_count)
                except:
                    facet_count = 0
                        
                # print(facet_count)
                
                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================   
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],  # [0]
                    g['MSMT_DTE_ID'],  # [1]
                    g['DATA_TYPE'],  # [2]
                    g['CNTRY_CDE'],  # [3]
                    g['SITE_CDE'],  # [4]
                    facet_type,  # [5]
                    facet_desc,  # [6]
                    facet_count,  # [7]
                    g['STARTED_AT'],  # [8]
                    ''  # [9]
                    )
                dbmgr.query(q)
    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,'w+', encoding='utf-8') as f:  
            f.writelines(str(soup))  
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes  
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================            
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  # [0]
        finished_at,  # [1]
        g['CNTRY_CDE'],  # [2]
        g['MSMT_DTE_ID']  # [3]
        )
    dbmgr.query(q)
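The WRITE HTML PAGE TO FILE blocks build each snapshot name by chaining replace() calls and concatenating a Windows-style path. A small helper along these lines (hypothetical, not part of the package) keeps the naming rule in one place and lets pathlib handle the separators; the filename rule mirrors the spirit of the replace(' ', '_').replace('/', '-') chains above rather than reproducing them exactly.

import re
from pathlib import Path

def write_html_snapshot(db_dir, parts, html_text):
    # Join the identifying parts with underscores, dropping anything that is
    # not filename-safe, then write the page under the __html subfolder.
    stem = '_'.join(re.sub(r'[^A-Za-z0-9&-]+', '_', str(p)).strip('_') for p in parts)
    out_path = Path(db_dir) / '__html' / (stem + '.html')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(html_text, encoding='utf-8')
    return out_path

# usage sketch:
# write_html_snapshot(g['CONFIG']['DB_DIR'],
#                     [g['MSMT_DTE_ID'], g['CNTRY_CDE'], g['SITE_CDE'], facet_type, facet_desc],
#                     str(soup))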
Example #24
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow,rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'], #[0]
        g['MSMT_DTE_ID'], #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'], #[3]
        g['SITE_CDE'] #[4]
        ) 
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + '/browse'
    passedHTML = pyHTMLPass.htmlPass(url,**g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes  
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - INDUSTRY DETAILS ===================================================================
    rndm_sleep = random.randint(rLow,rHigh)
    facet_type = 'INDUSTRY'
    link_txt_array = []
    
    for href in soup.find_all('a'):
        if '/BROWSE/' in str(href).upper() and '-JOBS' in str(href).upper(): # and 'LINK-DEFAULT' not in str(href).upper():
            full_ref = str(href)
            link_txt = str(href.get('href'))
            if link_txt.count('/') < 5:
                link_txt_array.append(link_txt.replace('/browse',''))
    #print(link_txt_array)   
            
    for link_txt in link_txt_array:
        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = link_txt
        passedHTML = pyHTMLPass.htmlPass(url,**g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)
    
        for h1 in soup.find_all('h1'):
            facet_desc = str(h1.text).upper().replace('BROWSE','').replace('IN AUSTRALIA','').strip()
        
        for span in soup.find_all('span', class_='c'):
            facet_count = str(span.text)
            facet_count = int(facet_count.replace(',',''))
        
        # =============================================================================
        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
        # =============================================================================   
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'], #[0]
            g['MSMT_DTE_ID'], #[1]
            g['DATA_TYPE'], #[2]
            g['CNTRY_CDE'], #[3]
            g['SITE_CDE'], #[4]
            facet_type, #[5]
            facet_desc, #[6]
            facet_count, #[7]
            g['STARTED_AT'], #[8]
            '' #[9]
            )
        dbmgr.query(q)
        # =============================================================================
        # WRITE HTML PAGE TO FILE
        # =============================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(' ','_').replace('/','-') + '_' + facet_desc.replace(' ','_').replace('/','-') + '.html'
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,'w+', encoding='utf-8') as f:  
                f.writelines(str(soup)) 
            
    # PASS 2 - REGIONAL DETAILS ===================================================================  
    facet_type = 'REGION'
    regions = g['REGIONS']
    # CONVERTS COMMA-DELIMITED STRING TO LIST FOR LOOPING
    regions_array = regions.split(',')
    # LOOP THROUGH ALL THE ITEMS IN REGIONS
    for region in regions_array:    
        time.sleep(rndm_sleep) # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE
        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'] + g['URL_PART1'] + '{}'.format(region.replace(' ','+'))
        passedHTML = pyHTMLPass.htmlPass(url,**g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)   
        soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS   
        soup = str(soup)
        # ==========================================================================================================================================================
        # SCRAPE SUB PART - START
        # ==========================================================================================================================================================
        facet_desc = str(region.upper())
        facet_count = re.search('1-10 of(.*?)</p>', soup).group(1) 
        facet_count = facet_count.replace(',','').strip()
        # =============================================================================
        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
        # =============================================================================   
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'], #[0]
            g['MSMT_DTE_ID'], #[1]
            g['DATA_TYPE'], #[2]
            g['CNTRY_CDE'], #[3]
            g['SITE_CDE'], #[4]
            facet_type, #[5]
            facet_desc, #[6]
            facet_count, #[7]
            g['STARTED_AT'], #[8]
            '' #[9]
            )
        dbmgr.query(q)
        # =============================================================================
        # WRITE HTML PAGE TO FILE
        # =============================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(' ','_').replace('/','-') + '_' + facet_desc.replace(' ','_').replace('/','-') + '.html'
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,'w+', encoding='utf-8') as f:  
                f.writelines(str(passedHTML)) 
    # ==========================================================================================================================================================
    # SCRAPE SUB PART - END
    # ==========================================================================================================================================================
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes  
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================            
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'], #[0]
        finished_at, #[1]
        g['CNTRY_CDE'], #[2]
        g['MSMT_DTE_ID'] #[3]
        )
    dbmgr.query(q)
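Each scrape() reads LOOP_RNDM_SLEEP_LOW/HIGH once and then sleeps for that same number of seconds on every loop iteration. If the intent is a fresh, human-looking pause before each request, the delay can be redrawn per call; a minimal sketch, assuming the low/high values come through as text from the vars table (the helper name is illustrative):

import random
import time

def human_pause(low, high):
    # Draw a new delay for every request rather than reusing a single value,
    # so repeated calls to the same site are less uniformly spaced.
    delay = random.uniform(float(low), float(high))
    time.sleep(delay)
    return delay

# e.g. human_pause(g['LOOP_RNDM_SLEEP_LOW'], g['LOOP_RNDM_SLEEP_HIGH']) before each htmlPass call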
def email_status(step):
    if step == 'START':
        # SUBJECT & RECIPIENTS
        mymail = pyMail(
            g['PKG_NME_PRNT'] + ' : NZ - STARTED @ ' +
            time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('End Of Message')
        # SEND
        mymail.send(**g)
    elif step == 'END':
        # =============================================================================
        # LOOPS THROUGH TABLE LIST AND GENERATES SUMMARY DATA FOR EMAIL
        # =============================================================================
        dbmgr = pyDB(g['DB'])
        q = r"""SELECT msmt_dte_id, cntry_cde, count( * ) AS row_cnt, sum(facet_cnt) as job_count FROM {0} WHERE cntry_cde = 'NZ' GROUP BY msmt_dte_id, cntry_cde ORDER BY msmt_dte_id DESC LIMIT 5""".format(
            'WEBDATA_JOBADS')
        rslt = dbmgr.query(q)
        # =============================================================================
        # EMAIL SUMMARY OF RESULTS TO DISTRIBUTION LIST
        # =============================================================================
        htmlRes = '''<table cellpadding="8" cellspacing="3" border="1">
                    <tr>
                    <th>msmt_date_id</th>
                    <th>cntry_cde</th>
                    <th>row_cnt</th>
                    <th>job_cnt</th>
                    </tr>'''
        for r in rslt:
            htmlRes = htmlRes + '<tr><td>' + str(r[0]) + '</td><td>' + str(
                r[1]) + '</td><td>' + str(r[2]) + '</td><td>' + str(
                    r[3]) + '</td></tr>'
        htmlRes = htmlRes + '</table>'
        # =============================================================================
        # LOOPS THROUGH TABLE LIST AND GENERATES SECONDARY SUMMARY DATA FOR EMAIL
        # =============================================================================
        dbmgr = pyDB(g['DB'])
        q = r"""select max(MSMT_DTE_ID) as msmt_dte_id, CNTRY_CDE,    SITE_CDE, SUM( CURR_ROW_CNT ) AS CURR_ROW_CNT, SUM( PREV_ROW_CNT ) AS PREV_ROW_CNT,    SUM( CURR_FACET_CNT ) AS CURR_FACET_CNT, SUM( PREV_FACET_CNT ) AS PREV_FACET_CNT
                from 
                    (
                        select msmt_dte_id,    cntry_cde, site_cde,
                           case
                                when MSMT_DTE_ID = strftime(
                                    '%Y%m%d',
                                    date(
                                        'now',
                                        'localtime'
                                    )
                                ) then count(*)
                                else 0
                            end as CURR_ROW_CNT,
                            case
                                when MSMT_DTE_ID = strftime(
                                    '%Y%m%d',
                                    date(
                                        'now',
                                        'localtime',
                                        '-1 day'
                                    )
                                ) then count(*)
                                else 0
                            end as PREV_ROW_CNT,
                            cast(case
                                when MSMT_DTE_ID = strftime(
                                    '%Y%m%d',
                                    date(
                                        'now',
                                        'localtime'
                                    )
                                ) then sum( FACET_CNT )
                                else 0
                            end as INTEGER) as CURR_FACET_CNT,
                            cast(case
                                when MSMT_DTE_ID = strftime(
                                    '%Y%m%d',
                                    date(
                                        'now',
                                        'localtime',
                                        '-1 day'
                                    )
                                ) then sum( FACET_CNT )
                                else 0
                            end as INTEGER) as PREV_FACET_CNT
                        from
                            {0}
                        where
                            1 = 1
                            and cntry_cde = 'NZ'
                            and MSMT_DTE_ID >= strftime(
                                '%Y%m%d',
                                date(
                                    'now',
                                    'localtime',
                                    '-1 day'
                                )
                            )
                        group by
                            msmt_dte_id,
                            cntry_cde,
                            site_cde
                    )
                    group BY
                    CNTRY_CDE,
                    SITE_CDE
                order by
                    1,
                    3""".format('WEBDATA_JOBADS')
        rslt = dbmgr.query(q)
        # =============================================================================
        # EMAIL SUMMARY OF RESULTS TO DISTRIBUTION LIST
        # =============================================================================
        htmlRes2 = '''<table cellpadding="8" cellspacing="3" border="1">
                    <tr>
                    <th>msmt_date_id</th>
                    <th>cntry_cde</th>
                    <th>site_cde</th>
                    <th>curr_row_cnt</th>
                    <th>prev_row_cnt</th>
                    <th>curr_facet_cnt</th>
                    <th>prev_facet_cnt</th>
                    </tr>'''
        for r in rslt:
            htmlRes2 = htmlRes2 + '<tr><td>' + str(r[0]) + '</td><td>' + str(
                r[1]) + '</td><td>' + str(r[2]) + '</td><td>' + str(
                    r[3]) + '</td><td>' + str(r[4]) + '</td><td>' + str(
                        r[5]) + '</td><td>' + str(r[6]) + '</td></tr>'
        htmlRes2 = htmlRes2 + '</table>'
        # SUBJECT & RECIPIENTS
        mymail = pyMail(
            g['PKG_NME_PRNT'] + ' : NZ - ENDED @ ' +
            time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('Scrape has completed for : ' + g['PKG_NME_PRNT'] +
                       ' : NZ')
        # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH SO LINE FEEDS NOT REQUIRED)
        # ADD LINE OF TEXT
        mymail.htmladd('Summary of Scrape for ' + g['PKG_NME_PRNT'] + ' : NZ')
        # ADD HTML TABLE CONSTRUCTED ABOVE
        mymail.htmladd(htmlRes)
        # ADD LINE OF TEXT
        mymail.htmladd('CURR and PREV days comparison')
        # ADD HTML TABLE CONSTRUCTED ABOVE
        mymail.htmladd(htmlRes2)
        # SEND
        mymail.send(**g)
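email_status() assembles its summary tables by concatenating <tr>/<td> strings inside a loop. A helper that renders any query result into the same markup keeps the two summaries (and any future ones) consistent; a sketch, assuming rows are plain tuples as returned by dbmgr.query() and the helper name is illustrative:

def html_table(headers, rows):
    # Render headers and result rows into the same cellpadded, bordered
    # table markup that the status emails already use.
    parts = ['<table cellpadding="8" cellspacing="3" border="1">',
             '<tr>' + ''.join('<th>{}</th>'.format(h) for h in headers) + '</tr>']
    for r in rows:
        parts.append('<tr>' + ''.join('<td>{}</td>'.format(c) for c in r) + '</tr>')
    parts.append('</table>')
    return ''.join(parts)

# e.g. mymail.htmladd(html_table(['msmt_date_id', 'cntry_cde', 'row_cnt', 'job_cnt'], rslt))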
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - INDUSTRY AND REGION COUNTS =========================================================
    for div in soup.find_all('div', class_='srp-filter-panel--inner'):
        for links in soup.find_all('a'):
            full_ref = str(links)
            link_txt = str(links.get('href'))

            match_pattern = re.search(r'\((.*?)\)', full_ref)

            if 'SRP-LIST-FILTER__ITEM' in str(links).upper(
            ) and match_pattern is not None and '/S-JOBS/C9302' not in str(
                    links).upper():  #and 'AD=OFFERING' in str(links).upper()
                if '/S-JOBS/ACT' in str(links).upper() or '/S-JOBS/NSW' in str(
                        links).upper() or '/S-JOBS/NT' in str(links).upper(
                        ) or '/S-JOBS/QLD' in str(
                            links).upper() or '/S-JOBS/SA' in str(links).upper(
                            ) or '/S-JOBS/TAS' in str(links).upper(
                            ) or '/S-JOBS/VIC' in str(links).upper(
                            ) or '/S-JOBS/WA' in str(links).upper():
                    facet_type = 'REGION'
                elif '/S-JOBS/JOBTYPE' in str(links).upper():
                    facet_type = 'JOBTYPE'
                elif '/S-JOBS/ADVERTISEDBY' in str(links).upper():
                    facet_type = 'ADVERTISED'
                else:
                    facet_type = 'INDUSTRY'

                #<a class="srp-list-filter__item-link link link--no-underline" href="/s-trades-services/c22340?ad=offering">Trades &amp; Services (7,955)</a>
                try:
                    objText = re.search(r'">(.*?)</a>', str(full_ref)).group(1)
                    facet_desc = objText.upper().replace('&AMP;', '&')
                    facet_desc = re.sub(r'\((.*?)\)', "", facet_desc)
                    facet_desc = re.sub(r'[^A-Za-z0-9& ]', '', facet_desc)
                    facet_desc = facet_desc.strip()

                    facet_count = re.search(r'\((.*?)\)', str(links)).group(1)
                    facet_count = int(facet_count.replace(',', ''))
                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'],  #[0]
                        g['MSMT_DTE_ID'],  #[1]
                        g['DATA_TYPE'],  #[2]
                        g['CNTRY_CDE'],  #[3]
                        g['SITE_CDE'],  #[4]
                        facet_type,  #[5]
                        facet_desc,  #[6]
                        facet_count,  #[7]
                        g['STARTED_AT'],  #[8]
                        ''  #[9]
                    )
                    #print(q)
                    dbmgr.query(q)

                except ValueError:
                    pass  # it was a string, not an int.
    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[
            'SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+',
                  encoding='utf-8') as f:
            f.writelines(str(soup))
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
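The facet parsing above pulls the label and the bracketed count out of anchor text such as 'Trades & Services (7,955)' with two separate regular expressions against the raw HTML. One pattern applied to the anchor's text (for example links.get_text()) can return both pieces at once; a hedged sketch, with an illustrative helper name and a sample phrase assumed from the commented example above:

import re

def split_facet(anchor_text):
    # Return (description, count) from text such as 'Trades & Services (7,955)',
    # or None when the anchor carries no bracketed count.
    m = re.search(r'^(.*?)\s*\(([\d,]+)\)\s*$', anchor_text.strip())
    if not m:
        return None
    desc = m.group(1).upper().strip()
    count = int(m.group(2).replace(',', ''))
    return desc, count

# e.g. split_facet('Trades & Services (7,955)') -> ('TRADES & SERVICES', 7955)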
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - TOTAL COUNT ========================================================================
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'

    nbr = re.search('>(.*?)jobs</span>', str(soup)).group(1)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]
    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        g['DATA_TYPE'],  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE'],  #[4]
        facet_type,  #[5]
        facet_desc,  #[6]
        facet_count,  #[7]
        g['STARTED_AT'],  #[8]
        ''  #[9]
    )
    dbmgr.query(q)
    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g[
            'CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(
                ' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+',
                  encoding='utf-8') as f:
            f.writelines(str(soup))
    # PASS 2 - INDUSTRY COUNT =====================================================================
    for links in soup.find_all('a'):
        link_txt = str(links.get('href'))
        link_nbr = re.findall(r'\d+', link_txt)
        link_nbr_ = ''.join(str(e) for e in link_nbr)
        if link_nbr_:
            nbr_chk = int(link_nbr_)
        else:
            nbr_chk = 0

        if 'JOBS-IN-' in link_txt.upper():
            facet_type = 'REGION'
        else:
            facet_type = 'INDUSTRY'
        # FINAL ASSIGNMENTS
        facet_desc = links.string.upper()
        facet_desc = re.sub(
            r"[!@#$']", '',
            str(facet_desc))  # removes special characters from string

        if 'JOBS-' in link_txt.upper() and nbr_chk <= int(
                g['REGION_CHK_ID']):  # if href matches what is considered relevant, do the following
            time.sleep(
                rndm_sleep
            )  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE
            # =============================================================================
            # PASS URL TO RETURN HTML FROM SITE PAGE
            # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
            # =============================================================================
            url = g['URL'] + link_txt
            passedHTML = pyHTMLPass.htmlPass(url, **g)
            soup = BeautifulSoup(passedHTML, "html.parser")
            #print(soup)
            #soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS
            #soup = str(soup)
            try:
                nbr = re.search(r'1 to(.*?)jobs',
                                str(soup.encode("utf-8"))).group(1)
                facet_count = int(nbr.strip().replace(
                    nbr.strip().rpartition(' ')[0], ''))
            except:
                facet_count = 0
            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],  #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],  #[2]
                g['CNTRY_CDE'],  #[3]
                g['SITE_CDE'],  #[4]
                facet_type,  #[5]
                facet_desc,  #[6]
                facet_count,  #[7]
                g['STARTED_AT'],  #[8]
                ''  #[9]
            )
            dbmgr.query(q)
            # =============================================================================
            # WRITE HTML PAGE TO FILE
            # =============================================================================
            if g['WRITE_HTML_TO_FILE'] == 'Y':
                file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[
                    'SITE_CDE'] + '_' + facet_type.replace(
                        ' ', '_') + '_' + facet_desc.replace(' ',
                                                             '_') + '.html'
                with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                          'w+',
                          encoding='utf-8') as f:
                    f.writelines(str(soup))
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
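The PASS 2 loop above recovers the total from a phrase like '1 to 20 of 4,321 jobs' by stripping everything before the final space, which is hard to follow at a glance. An equivalent, more explicit sketch (the sample phrase is an assumption about the page wording, and the helper name is illustrative):

import re

def jobs_total(page_text):
    # Take the text between '1 to' and 'jobs' and keep the last number in it,
    # e.g. '1 to 20 of 4,321 jobs' -> 4321; returns 0 when the phrase is absent.
    m = re.search(r'1 to(.*?)jobs', page_text)
    if not m:
        return 0
    numbers = re.findall(r'\d[\d,]*', m.group(1))
    return int(numbers[-1].replace(',', '')) if numbers else 0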
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - TOTAL COUNT ========================================================================
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'
    nbr = re.search('<title>(.*?)</title>', str(soup.encode("utf-8"))).group(1)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]
    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        g['DATA_TYPE'],  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE'],  #[4]
        facet_type,  #[5]
        facet_desc,  #[6]
        facet_count,  #[7]
        g['STARTED_AT'],  #[8]
        ''  #[9]
    )
    dbmgr.query(q)
    # PASS 2 - INDUSTRY COUNT =====================================================================
    for ul in soup.find_all('ul', class_='facet'):
        for li in ul.find_all('li'):
            # return the facet text (section title)
            facet = li.find(
                'strong'
            )  # assumes the first row of the facet is the "title" row - breaks if it isnt
            if facet:
                facet_type = facet.text.upper()
            else:
                facet_type = facet_type.upper(
                )  # if None is found, apply current facet_type value to next facet_type value

            facet_desc = li.find('a')

            if facet_desc:  # checks if there is a result on the search for the "a" anchor (removes the title of the sections by default - returned above)
                try:
                    facet_desc = facet_desc.text.upper()
                    facet_desc = re.sub(r"[!@#$']", '', str(
                        facet_desc))  # removes special characters from string
                    facet_count = li.find('span')
                    facet_count = int(facet_count.text.replace(',', ''))
                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'],  #[0]
                        g['MSMT_DTE_ID'],  #[1]
                        g['DATA_TYPE'],  #[2]
                        g['CNTRY_CDE'],  #[3]
                        g['SITE_CDE'],  #[4]
                        facet_type,  #[5]
                        facet_desc,  #[6]
                        facet_count,  #[7]
                        g['STARTED_AT'],  #[8]
                        ''  #[9]
                    )
                    dbmgr.query(q)

                except:
                    pass

            else:  # if no "a" anchor is found, ignore
                pass
    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[
            'SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+',
                  encoding='utf-8') as f:
            f.writelines(str(soup))
    # PASS 3 - REGION COUNT =====================================================================
    facet_type = 'REGION'
    regions = g['REGIONS']
    # CONVERTS COMMA-DELIMITED STRING TO LIST FOR LOOPING
    regions_array = regions.split(',')
    # LOOP THROUGH ALL THE ITEMS IN REGIONS
    for region in regions_array:
        time.sleep(
            rndm_sleep
        )  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE
        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'] + g['URL_PART1'] + g['URL_PART2'] + '{}'.format(region.replace(' ', '+'))
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)
        facet_desc = str(region.upper())
        facet_count = re.search(r'10</span> of <span>(.*?)</span>',
                                str(soup.encode("utf-8"))).group(1)
        facet_count = int(facet_count.replace(',', ''))
        # =============================================================================
        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
        # =============================================================================
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],  #[0]
            g['MSMT_DTE_ID'],  #[1]
            g['DATA_TYPE'],  #[2]
            g['CNTRY_CDE'],  #[3]
            g['SITE_CDE'],  #[4]
            facet_type,  #[5]
            facet_desc,  #[6]
            facet_count,  #[7]
            g['STARTED_AT'],  #[8]
            ''  #[9]
        )
        dbmgr.query(q)
        # =============================================================================
        # WRITE HTML PAGE TO FILE
        # =============================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' +
                         g['SITE_CDE'] + '_' +
                         facet_type.replace(' ', '_').replace('/', '-') + '_' +
                         facet_desc.replace(' ', '_').replace('/', '-') +
                         '.html')
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                      'w+',
                      encoding='utf-8') as f:
                f.write(str(soup))  # THE WITH-BLOCK CLOSES THE FILE ON EXIT
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
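
The INSERT statements above splice the scraped values straight into the SQL with str.format, which breaks as soon as a facet description contains a quote. Below is a minimal parameterised sketch, assuming a SQLite backing store and a table with the same nine columns; insert_facet_row and the direct sqlite3 usage are illustrative only (the examples go through pyDB.query instead):

import sqlite3

def insert_facet_row(db_path, tbl_nme, row):
    # Illustrative only: bind the scraped values as query parameters instead of
    # formatting them into the SQL string, so quotes and commas in the data are safe.
    q = ("INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, "
         "FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) "
         "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)").format(tbl_nme)
    with sqlite3.connect(db_path) as conn:
        conn.execute(q, (row['MSMT_DTE_ID'], row['DATA_TYPE'], row['CNTRY_CDE'],
                         row['SITE_CDE'], row['FACET_TYPE'], row['FACET_DESC'],
                         row['FACET_CNT'], row['STARTED_AT'], row['FINISHED_AT']))

Only the table name still goes through format, since SQL identifiers cannot be bound as parameters.
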
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - INDUSTRY & REGION ==========================================================
    for div in soup.find_all('div', id="centre_col"):
        for a in div.find_all('a'):
            if r"/IN/" in str(a).upper():
                facet_type = 'REGION'
            else:
                facet_type = 'INDUSTRY'
            dest_url1 = str(a.get('href'))
            time.sleep(
                rndm_sleep
            )  # INSERT A PAUSE SO LOOPED REQUESTS LOOK MORE LIKE HUMAN INTERACTION ON THE WEBPAGE
            # =============================================================================
            # PASS URL TO RETURN HTML FROM SITE PAGE
            # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
            # =============================================================================
            url = g['URL'] + dest_url1
            passedHTML = pyHTMLPass.htmlPass(url, **g)
            soup = BeautifulSoup(passedHTML, "html.parser")
            #soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS
            #soup = str(soup) # MAY NEED TO RE-PASS THE TEXT BACK THROUGH THE BEAUTIFULSOUP PARSER

            for h1 in soup.find_all('h1'):
                for a in h1.find_all('a'):
                    dest_url2 = str(a.get('href'))
                    txt = a.text.upper()
                    facet_desc = txt.replace(r"'", '').replace('JOBS',
                                                               '').strip()
                    # =============================================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================================
                    url = g['URL'] + dest_url2
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    #soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS
                    #soup = str(soup) # MAY NEED TO RE-PASS THE TEXT BACK THROUGH THE BEAUTIFULSOUP PARSER
                    nbr = re.search(
                        r'</SPAN> OF <SPAN>(.*?)</SPAN>',
                        str(soup).encode("utf-8", "ignore").decode(
                            'ascii', 'ignore').upper()).group(1)
                    nbr = str(nbr).replace(',', '')
                    facet_count = int(nbr)
                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'],  #[0]
                        g['MSMT_DTE_ID'],  #[1]
                        g['DATA_TYPE'],  #[2]
                        g['CNTRY_CDE'],  #[3]
                        g['SITE_CDE'],  #[4]
                        facet_type,  #[5]
                        facet_desc,  #[6]
                        facet_count,  #[7]
                        g['STARTED_AT'],  #[8]
                        ''  #[9]
                    )
                    dbmgr.query(q)
                    # =============================================================================
                    # WRITE HTML PAGE TO FILE
                    # =============================================================================
                    if g['WRITE_HTML_TO_FILE'] == 'Y':
                        file_name = (
                            g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' +
                            g['SITE_CDE'] + '_' +
                            facet_type.replace(' ', '_').replace('/', '-') +
                            '_' +
                            facet_desc.replace(' ', '_').replace('/', '-') +
                            '.html')
                        with open(g['CONFIG']['DB_DIR'] + '__html\\' +
                                  file_name,
                                  'w+',
                                  encoding='utf-8') as f:
                            f.write(str(soup))  # THE WITH-BLOCK CLOSES THE FILE ON EXIT
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
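
Both passes above extract the listing total with a single regular expression against the "... 10</span> of <span>N</span>" fragment of the page. A standalone sketch of that extraction (extract_facet_count and the sample markup are illustrative, not part of the scraper):

import re

def extract_facet_count(html_text):
    # Illustrative helper: pull the total out of markup shaped like
    # "... 1 to 10</span> of <span>12,345</span>" and strip the thousands separator.
    m = re.search(r'10</span> of <span>(.*?)</span>', html_text)
    if m is None:
        return None  # pattern not found - the caller's bare except would swallow this
    return int(m.group(1).replace(',', ''))

print(extract_facet_count('Showing 1 to 10</span> of <span>12,345</span> results'))  # -> 12345
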
Example #30
0
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (captr_dte_id = {1} or captr_dte_id <= {2})""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
    )
    dbmgr.query(q)

    # =============================================================================
    # LOOP THROUGH DATES FOR HISTORICAL SCRAPES (ONLY REQUIRED FOR FIRST RUN)
    #
    # =============================================================================
    # dts = g['MONTH_LIST'] # ONLY NEEDED TO RUN HISTORY
    first_dy_curr_mth = fdttm.today().replace(day=1)

    dts = []
    dts.append((first_dy_curr_mth - datetime.timedelta(days=1)
                ).strftime('%b.%Y').lower())  # PREVIOUS MONTH
    dts.append(time.strftime('%b.%Y').lower())  # CURRENT MONTH
    dts.append((datetime.date.today() + relativedelta.relativedelta(months=1)
                ).strftime('%b.%Y').lower())  # NEXT MONTH X1
    dts.append((datetime.date.today() + relativedelta.relativedelta(months=2)
                ).strftime('%b.%Y').lower())  # NEXT MONTH X2
    #dts.append( (datetime.date.today() + relativedelta.relativedelta(months=3)).strftime('%b.%Y').lower() ) # NEXT MONTH X3  -- CALENDAR DOESNT GO THIS FAR FORWARD

    #dts_array = []
    # LOOP THROUGH EACH MONTH KEY IN THE LIST
    for item in dts:
        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'] + item
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)#.encode("utf-8"))

        # ==========================================================================================================================================================
        # SCRAPE PART - START
        # - this should be the primary section of code that changes
        # - only other sections that "may" change are DELETE and UPDATE DB statements
        # ==========================================================================================================================================================

        # GET MONTH AND YEAR FROM HEADER
        for div in soup.find_all('div', class_='head'):
            for span in div.find_all('span'):
                if '<strong>' in str(span).lower():
                    dt_part = span.text.upper().split(' ')
                    annce_mth = dt_part[-2]
                    annce_yr = dt_part[-1]

        for tab in soup.find_all('table', class_='calendar__table'):
            for row in tab.find_all('tr'):
                if 'calendar__row' in str(row).lower(
                ) and 'day-breaker' not in str(row).lower(
                ) and 'calendarexpanded__container' not in str(row).lower():

                    cell_nbr = 1  # INITIALISE CELL NBR TO 1 - DATA WILL BE ASSIGNED BASED ON CELL NBR (POSITION), WHICH SHOULDN'T CHANGE

                    for cell in row.find_all('td'):
                        #print(cell)
                        if cell_nbr == 1:  # DATE OF MONTH (IF NOT NULL)
                            try:
                                dt = re.search('<span>(.*)</span>',
                                               str(cell).lower())
                                dt = dt.group(1).replace('</span>', '').upper()

                                dt_part = dt.split(' ')

                                mth_nme = dt_part[0]
                                dy_nbr = dt_part[1]
                                if len(dy_nbr) == 1:
                                    dy_nbr = '0' + str(dy_nbr)
                                else:
                                    dy_nbr = str(dy_nbr)

                                mth_nbr = g['MONTH_NBR_CNVRT'].get(mth_nme)
                                annce_dt = str(annce_yr) + '-' + str(
                                    mth_nbr) + '-' + str(dy_nbr)
                                msmt_dte_id = str(annce_yr) + str(
                                    mth_nbr) + str(dy_nbr)
                            except:
                                annce_dt = annce_dt  # KEEP THE DATE FROM THE PREVIOUS ROW WHEN THIS CELL HAS NO NEW DATE

                        elif cell_nbr == 2:  # TIME OF DAY (MIGHT BE "ALL DAY" EVENT)
                            if cell.text.strip().upper() != '':
                                annce_tm = cell.text.strip().upper()
                            else:
                                try:
                                    annce_tm = annce_tm  # REUSE THE TIME FROM THE PREVIOUS ROW IF ONE EXISTS
                                except:
                                    annce_tm = ''

                        elif cell_nbr == 3:  # CNTRY CDE
                            try:
                                cntry_cde = cell.text.strip().upper()
                            except:
                                cntry_cde = ''

                        elif cell_nbr == 4:  # IMPACT (LOW / MEDIUM / HIGH)
                            result = cell.find('span')
                            if result is not None:
                                impact = result.get('title')
                                impact = impact.upper().replace(
                                    'IMPACT EXPECTED', '').strip()
                            else:
                                impact = ''

                        elif cell_nbr == 5:  # EVENT DESCRIPTION
                            try:
                                for span in cell.find_all('span'):
                                    event_desc = span.text.strip().upper()
                            except:
                                event_desc = ''

                        elif cell_nbr == 6:  # -- IGNORE --  LINK TO DETAILS
                            pass

                        elif cell_nbr == 7:  # ACTUAL VALUE
                            try:
                                actual_val = cell.text.strip()
                            except:
                                actual_val = ''

                        elif cell_nbr == 8:  # FORECAST VALUE
                            try:
                                forecast_val = cell.text.strip()
                            except:
                                forecast_val = ''

                        elif cell_nbr == 9:  # PREVIOUS VALUE
                            try:
                                previous_val = cell.text.strip()
                            except:
                                previous_val = ''

                        elif cell_nbr == 10:  # -- IGNORE --  LINK TO GRAPH
                            pass

                        else:
                            continue

                        cell_nbr = cell_nbr + 1

                    # GENERATE A CODE FROM THE CNTRY CDE AND EVENT DESC
                    annce_cde = pyLIB.codeGen(cntry_cde + ' ' +
                                              event_desc)  # GET CODE

                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, SITE_CDE, ANNCE_DTE, ANNCE_TM, CNTRY_CDE, ANNCE_CDE, ANNCE_DESC, IMPACT, ACTUAL, FORECAST, PREVIOUS, CAPTR_DTE_ID, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', "{8}", '{9}', '{10}', '{11}', '{12}', {13}, '{14}', '{15}')""".format(
                        g['TBL_NME'],  #[0]
                        msmt_dte_id,  #[1]
                        g['DATA_TYPE'],  #[2]
                        g['SITE_CDE'],  #[3]
                        annce_dt,  #[4]
                        annce_tm,  #[5]
                        cntry_cde,  #[6]
                        annce_cde,  #[7]
                        event_desc,  #[8]
                        impact,  #[9]
                        actual_val,  #[10]
                        forecast_val,  #[11]
                        previous_val,  #[12]
                        g['MSMT_DTE_ID'],  #[13]
                        g['STARTED_AT'],  #[14]
                        ''  #[15]
                    )
                    #print(q)
                    dbmgr.query(q)

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE captr_dte_id = {2}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['MSMT_DTE_ID']  #[2]
    )
    dbmgr.query(q)
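
Example #30 builds its list of calendar month keys from the first day of the current month: the previous month, the current month, and the next two months, each formatted as '%b.%Y' in lower case. A self-contained sketch of that date arithmetic using plain datetime plus dateutil.relativedelta (build_month_keys is a hypothetical helper name; the example itself relies on the module-level fdttm alias):

import datetime
from dateutil import relativedelta

def build_month_keys(today=None):
    # Standalone sketch of the dts list above: previous month, current month
    # and the next two months, formatted the way the calendar URLs expect ('mar.2024').
    today = today or datetime.date.today()
    first_dy_curr_mth = today.replace(day=1)
    keys = [(first_dy_curr_mth - datetime.timedelta(days=1)).strftime('%b.%Y').lower()]
    for months_ahead in range(0, 3):
        keys.append((today + relativedelta.relativedelta(months=months_ahead)).strftime('%b.%Y').lower())
    return keys

print(build_month_keys(datetime.date(2024, 3, 15)))
# -> ['feb.2024', 'mar.2024', 'apr.2024', 'may.2024']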