def write_log():
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    # =============================================================================
    # WRITE RESULTS OF SUCCESS TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    dbmgr.write_log(finished_at, None, **g)
def get_vars():
    dbmgr = pyDB(g['DB'])
    rslt = dbmgr.get_vars(**g)
    # ADD RESULTS FROM GET_VARS CALL TO DICTIONARY (g)
    for r in rslt:
        g[str(r[0])] = str(r[1])
        #print(r)
    print([g])
def script_run():
    # =============================================================================
    # RETURNS LIST OF PACKAGES THAT HAVE ERRORS LOGGED IN THE LOG TABLE
    # - RERUNS ANY PACKAGES FOUND
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""select dat.PKG_NME, dat.VARS
            from PY_PROCESS_LOG dat,
                 ( select PKG_NME,
                          max( START_DATETIME ) as START_DATETIME,
                          max( END_DATETIME ) as END_DATETIME
                   from PY_PROCESS_LOG
                   where 1 = 1
                   and msmt_dte_id = strftime('%Y%m%d', date('now','localtime'))
                   and PKG_NME not like '%BATCH%'
                   group by PKG_NME ) list
            where 1 = 1
            and dat.PKG_NME = list.PKG_NME
            and dat.END_DATETIME = list.END_DATETIME
            and dat.STATUS = 'ERROR'"""
    rslt = dbmgr.query(q)
    # =============================================================================
    # CREATE A NEW DICTIONARY AND POPULATE IT WITH THE QUERY RESULTS
    # =============================================================================
    log_vars = {k: v for k, v in rslt}
    # =============================================================================
    # RUN EACH PACKAGE LISTED IN THE DICTIONARY
    # =============================================================================
    try:
        for pkg in log_vars:
            #print(pkg + ' corresponds to ' + log_vars[pkg])
            tempDict = eval(log_vars[pkg])  # CONVERTS STRING TO DICTIONARY
            subprocess.call([pyPath, str(tempDict['PKG_PATH'] + '\\' + pkg + '.py')])
    except:
        finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
        # =============================================================================
        # WRITE RESULTS OF ERROR TO LOCAL DB
        # =============================================================================
        e = sys.exc_info()
        dbmgr = pyDB(g['DB'])  # was pyDB(db): 'db' is undefined in this scope
        dbmgr.write_log(finished_at, 'SCRIPT RUN ERROR: ' + str(e), **g)
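# NOTE: eval() on the stored VARS string will execute arbitrary code if the log
# table is ever tampered with. A minimal safer sketch, assuming the VARS column
# holds a plain Python dict literal (as the code above implies):
import ast

def parse_vars(vars_str):
    """Parse the VARS column into a dict without executing code.

    ast.literal_eval only accepts Python literals (dicts, lists, strings,
    numbers), so a tampered VARS value raises ValueError instead of running
    arbitrary code.
    """
    return ast.literal_eval(vars_str)

# usage inside the loop above: tempDict = parse_vars(log_vars[pkg])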
def email_status(step):
    if step == 'START':
        # SUBJECT & RECIPIENTS
        mymail = pyMail(g['PKG_NME_PRNT'] + ' - STARTED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('End Of Message')
        # SEND
        mymail.send(**g)
    elif step == 'END':
        # =============================================================================
        # LOOPS THROUGH TABLE LIST AND GENERATES SUMMARY DATA FOR EMAIL
        # =============================================================================
        dbmgr = pyDB(g['DB'])
        q = r"""SELECT msmt_dte_id,
                       count(*) as row_count,
                       round(sum(INDEX_VAL),4) as index_val,
                       round(sum(HIGH_VAL),4) as high_val,
                       round(sum(LOW_VAL),4) as low_val,
                       round(sum(TTL_MRKT_VAL),4) as ttl_mrkt_val
                FROM {0}
                WHERE 1 = 1
                GROUP BY msmt_dte_id
                ORDER BY msmt_dte_id DESC
                LIMIT 5""".format('PY_COMMODITY_DATA')
        rslt = dbmgr.query(q)
        # =============================================================================
        # EMAIL SUMMARY OF RESULTS TO DISTRIBUTION LIST
        # =============================================================================
        htmlRes = '''<table cellpadding="8" cellspacing="3" border="1">
                     <tr>
                       <th>msmt_date_id</th>
                       <th>row_count</th>
                       <th>index_val</th>
                       <th>high_val</th>
                       <th>low_val</th>
                       <th>ttl_mrkt_val</th>
                     </tr>'''
        for r in rslt:
            htmlRes = htmlRes + ('<tr><td>' + str(r[0]) + '</td><td>' + str(r[1]) +
                                 '</td><td>' + str(r[2]) + '</td><td>' + str(r[3]) +
                                 '</td><td>' + str(r[4]) + '</td><td>' + str(r[5]) + '</td></tr>')
        htmlRes = htmlRes + '</table>'
        # SUBJECT & RECIPIENTS
        mymail = pyMail(g['PKG_NME_PRNT'] + ' - ENDED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('Scrape has completed for : ' + g['PKG_NME_PRNT'])
        # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH, SO LINE FEEDS NOT REQUIRED)
        # ADD LINE OF TEXT
        mymail.htmladd('Summary of Scrape for ' + g['PKG_NME_PRNT'])
        # ADD HTML TABLE CONSTRUCTED ABOVE
        mymail.htmladd(htmlRes)
        # SEND
        mymail.send(**g)
def send(self, **g):
    # =============================================================================
    # ATTEMPT TO SEND EMAIL 3 TIMES BEFORE FAILING WITH ERROR
    # =============================================================================
    for i in range(3):
        try:
            db = self.db
            # =============================================================================
            # CONVERT COMMA-SEPARATED STRING TO LIST FOR EMAIL DISTRIBUTION SEND
            # =============================================================================
            recipients = self.recipients
            recipient_array = []
            for item in recipients.split(','):  # COMMA-SEPARATED
                recipient_array.append(item.strip())

            msg = MIMEMultipart('alternative')
            msg['From'] = self.sender
            msg['Subject'] = self.subject
            msg['To'] = ", ".join(recipient_array)  # THE HEADER IS A SINGLE COMMA-SEPARATED STRING
            msg.preamble = "preamble goes here"
            # CHECK FOR ATTACHMENTS AND ADD IF FOUND
            if self.attachments:
                self.attach(msg)
            # ADD HTML BODY AFTER ATTACHMENTS
            msg.attach(MIMEText(self.htmlbody, 'html'))
            # SEND
            s = smtplib.SMTP('smtp.gmail.com:587')
            s.ehlo()
            s.starttls()
            s.login(self.sender, self.senderpass)
            # THE ENVELOPE RECIPIENTS MUST BE A LIST OF ADDRESSES; PASSING THE RAW
            # COMMA-SEPARATED STRING WOULD BE TREATED AS A SINGLE ADDRESS
            s.sendmail(self.sender, recipient_array, msg.as_string())
            #print(msg)  # DEBUG
            s.quit()
            # BREAK FROM RETRY LOOP ON SUCCESS
            break
        except:
            finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
            # =============================================================================
            # WRITE RESULTS OF ERROR TO LOCAL DB
            # =============================================================================
            e = sys.exc_info()
            dbmgr = pyDB(db)
            dbmgr.write_log(finished_at, 'EMAIL ERROR: ' + str(e), **g)
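# For reference, a minimal usage sketch of this class, mirroring how
# email_status() drives it elsewhere in this codebase (subject string plus the
# shared **g config; htmladd() appends a paragraph, send() retries as above):
#
#   mymail = pyMail('MY_PKG - STARTED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g)
#   mymail.htmladd('Opening line of the email')
#   mymail.htmladd('<b>More detail, HTML allowed</b>')
#   mymail.send(**g)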
def email_status(step):
    if step == 'START':
        # SUBJECT & RECIPIENTS
        mymail = pyMail(g['PKG_NME_PRNT'] + ' - STARTED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('End Of Message')
        # SEND
        mymail.send(**g)
    elif step == 'END':
        # =============================================================================
        # LOOPS THROUGH TABLE LIST AND GENERATES SUMMARY DATA FOR EMAIL
        # =============================================================================
        dbmgr = pyDB(g['DB'])
        q = r"""SELECT MSMT_DTE_ID,
                       CNTRY_CDE,
                       count(*) as row_count
                FROM {0}
                WHERE 1 = 1
                AND msmt_dte_id >= strftime('%Y%m%d', date('now','localtime','-6 day'))
                GROUP BY MSMT_DTE_ID, CNTRY_CDE
                ORDER BY CASE CNTRY_CDE
                           WHEN 'AU' THEN 1
                           WHEN 'NZ' THEN 2
                           WHEN 'UK' THEN 3
                           WHEN 'CA' THEN 4
                           WHEN 'US' THEN 5
                         END, 1 DESC""".format('PY_EMP_TWITTER_DATA')
        rslt = dbmgr.query(q)
        # =============================================================================
        # EMAIL SUMMARY OF RESULTS TO DISTRIBUTION LIST
        # =============================================================================
        htmlRes = '''<table cellpadding="8" cellspacing="3" border="1">
                     <tr>
                       <th>msmt_date_id</th>
                       <th>cntry_cde</th>
                       <th>row_count</th>
                     </tr>'''
        for r in rslt:
            htmlRes = htmlRes + ('<tr><td>' + str(r[0]) + '</td><td>' + str(r[1]) +
                                 '</td><td>' + str(r[2]) + '</td></tr>')
        htmlRes = htmlRes + '</table>'
        # SUBJECT & RECIPIENTS
        mymail = pyMail(g['PKG_NME_PRNT'] + ' - ENDED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('Scrape has completed for : ' + g['PKG_NME_PRNT'])
        # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH, SO LINE FEEDS NOT REQUIRED)
        # ADD LINE OF TEXT
        mymail.htmladd('Summary of Scrape for ' + g['PKG_NME_PRNT'])
        # ADD HTML TABLE CONSTRUCTED ABOVE
        mymail.htmladd(htmlRes)
        # SEND
        mymail.send(**g)
def script_run():
    # =============================================================================
    # CONVERT COMMA-SEPARATED STRING TO LIST FOR PACKAGE RUNNING
    # =============================================================================
    pkgs_to_run = g['PKGS_TO_RUN']
    pkgs_to_run_array = []
    for item in pkgs_to_run.split(','):  # COMMA-SEPARATED
        pkgs_to_run_array.append(item)
    try:
        # LOOP THROUGH ALL THE PACKAGES/SCRIPTS FOR THIS FOLDER
        for pkg in pkgs_to_run_array:
            subprocess.call([pyPath, str(path + '\\' + pkg + '.py')])
    except:
        finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
        # =============================================================================
        # WRITE RESULTS OF ERROR TO LOCAL DB
        # =============================================================================
        e = sys.exc_info()
        dbmgr = pyDB(g['DB'])  # was pyDB(db): 'db' is undefined in this scope
        dbmgr.write_log(finished_at, 'SCRIPT RUN ERROR: ' + str(e), **g)
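# NOTE: subprocess.call() returns the child's exit code but never raises, so a
# package that dies with a traceback is silently treated as success. A small
# sketch of a stricter variant (same pyPath/path variables as above;
# run_package is a hypothetical helper name):
def run_package(pyPath, path, pkg):
    """Run one package script and raise if it exits non-zero."""
    # check=True makes a failing script raise CalledProcessError, which the
    # caller's except block can then log to the db
    subprocess.run([pyPath, path + '\\' + pkg + '.py'], check=True)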
def store_tweet(**tweet):
    # =============================================================================
    # WRITE RESULTS OF TWEEPY EXTRACT TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    # NOTE: the column list names 16 columns, so VALUES needs 16 placeholders;
    # the original had only 14 and omitted POLARITY and SUBJECTIVITY (assumed
    # present in the tweet dict, as the column list implies)
    q = r"""INSERT INTO {0}
            (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, LOCATION, GEO, COORDS,
             UTC_OFFSET, TIME_ZONE, LANGUAGE, DESCRIPTION, TEXT, USER_NAME,
             USER_CREATED, RETWEET_COUNT, POLARITY, SUBJECTIVITY)
            VALUES
            ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}',
             '{10}', '{11}', '{12}', '{13}', {14}, {15}, {16})""".format(
        g['TBL_NME'],            #[0]
        tweet['MSMT_DTE_ID'],    #[1]
        tweet['DATA_TYPE'],      #[2]
        tweet['CNTRY_CDE'],      #[3]
        tweet['LOCATION'],       #[4]
        tweet['GEO'],            #[5]
        tweet['COORDS'],         #[6]
        tweet['UTC_OFFSET'],     #[7]
        tweet['TIME_ZONE'],      #[8]
        tweet['LANGUAGE'],       #[9]
        tweet['DESCRIPTION'],    #[10]
        tweet['TEXT'],           #[11]
        tweet['USER_NAME'],      #[12]
        tweet['USER_CREATED'],   #[13]
        tweet['RETWEET_COUNT'],  #[14]
        tweet['POLARITY'],       #[15]
        tweet['SUBJECTIVITY']    #[16]
    )
    dbmgr.query(q)
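# NOTE: interpolating tweet text straight into the SQL string breaks on any
# embedded single quote and is an injection vector. A sketch of the same
# insert with parameter binding, assuming pyDB wraps a sqlite3 connection
# (the strftime()/date() calls elsewhere suggest SQLite); `conn` is a
# hypothetical raw connection handle:
def store_tweet_safe(conn, table, tweet):
    """Insert one tweet using bound parameters instead of str.format()."""
    cols = ('MSMT_DTE_ID', 'DATA_TYPE', 'CNTRY_CDE', 'LOCATION', 'GEO',
            'COORDS', 'UTC_OFFSET', 'TIME_ZONE', 'LANGUAGE', 'DESCRIPTION',
            'TEXT', 'USER_NAME', 'USER_CREATED', 'RETWEET_COUNT',
            'POLARITY', 'SUBJECTIVITY')
    # the table name must still be formatted in (identifiers cannot be bound),
    # but every value goes through a ? placeholder
    q = "INSERT INTO {0} ({1}) VALUES ({2})".format(
        table, ', '.join(cols), ', '.join('?' * len(cols)))
    conn.execute(q, tuple(tweet[c] for c in cols))
    conn.commit()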
def htmlDownloadLink(url, fileSearchStr, linkId, **g):
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    try:
        rndm_sleep = int(g['SLEEP_VAL'])
    except:
        rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
        rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
        rndm_sleep = random.randint(rLow, rHigh)
    try:
        # ================================================================
        # DOWNLOAD FILE FROM PAGE LINK
        # ================================================================
        # An earlier attempt used the DevTools "send_command" endpoint
        # (Page.setDownloadBehavior) without success; setting the download
        # directory through profile prefs works once the real path variable is
        # passed -- the original passed the literal string r"g['CONFIG']['DB_DIR']",
        # which is why downloads still landed in the default directory.
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_argument("--start-maximized")
        prefs = {
            "profile.default_content_settings.popups": 0,
            # IMPORTANT - THE ENDING SLASH IS REQUIRED
            "download.default_directory": g['CONFIG']['DB_DIR'] + '__fx\\',
            "directory_upgrade": True
        }
        chromeOptions.add_experimental_option("prefs", prefs)
        driver = webdriver.Chrome(
            executable_path=str(g['DRVR_PATH'] + '\\' + g['WEB_DRVR_NME']),
            chrome_options=chromeOptions)
        driver.get(url)
        if linkId:
            # CLICK THE DESIRED PAGE ELEMENT TO START THE DOWNLOAD
            # (click() returns None, so there is no link value to return)
            driver.find_element_by_id(linkId).click()
        time.sleep(int(rndm_sleep))
        # a watch loop on .crdownload/.part files was removed here because it
        # never terminated; see the bounded wait sketch after this function
        driver.close()
        driver.quit()
        driver.stop_client()
    except:
        finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
        e = sys.exc_info()
        print('ERROR ENCOUNTERED : ' + str(e))
        # =============================================================================
        # WRITE RESULTS OF ERROR TO LOCAL DB
        # =============================================================================
        dbmgr = pyDB(g['DB'])
        dbmgr.write_log(finished_at, 'DOWNLOAD LINK ERROR: ' + str(e), **g)
        # =============================================================================
        # EMAIL SUMMARY OF ERROR TO DISTRIBUTION LIST
        # =============================================================================
        htmlRes = '''<table cellpadding="8" cellspacing="3" border="3">
                     <tr>
                       <th>msmt_date_id</th>
                       <th>pkg_nme</th>
                       <th>start_datetime</th>
                       <th>end_datetime</th>
                       <th>status</th>
                     </tr>'''
        htmlRes = htmlRes + ('<tr><td>' + str(g['MSMT_DTE_ID']) + '</td><td>' +
                             str(g['PKG_NME']) + '</td><td>' + str(g['STARTED_AT']) +
                             '</td><td>' + str(finished_at) + '</td><td>' + 'ERROR' + '</td></tr>')
        htmlRes = htmlRes + '</table>'
        # SUBJECT & RECIPIENTS
        mymail = pyMail(str(g['PKG_NME']) + ' - ERROR ENCOUNTERED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('A DOWNLOAD LINK ERROR was encountered for package : ' + str(g['PKG_NME']))
        # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH, SO LINE FEEDS NOT REQUIRED)
        # ADD LINE OF TEXT
        mymail.htmladd('Summary of ERROR')
        # ADD HTML TABLE CONSTRUCTED ABOVE
        mymail.htmladd(htmlRes)
        # HEADER FOR ERROR TEXT
        mymail.htmladd('<b><u>ERROR DETAIL</u></b>')
        # ADD FULL ERROR TO BODY OF EMAIL (ANGLE BRACKETS ESCAPED FOR HTML)
        mymail.htmladd(str(e).replace('<', '(').replace('>', ')'))
        # SEND
        mymail.send()
        # QUIT EXECUTION OF PYTHON SCRIPT
        quit()
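# NOTE: the watch loop removed above never terminated because it iterated a
# directory listing taken once. A sketch of a bounded wait that re-lists the
# download directory until Chrome's partial-download markers disappear; the
# helper name and timeout values are assumptions:
import os
import time

def wait_for_download(download_dir, file_search_str, timeout=300, poll=10):
    """Return True once no matching .crdownload/.part file remains.

    Re-reads the directory on every poll (the original loop listed it only
    once, which is why it spun forever), and gives up after `timeout` seconds
    rather than blocking indefinitely.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        partials = [f for f in os.listdir(download_dir)
                    if file_search_str in f
                    and (f.endswith('.crdownload') or f.endswith('.part'))]
        if not partials:
            return True
        time.sleep(poll)
    return False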
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
            and cntry_cde = '{3}'
            and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # (shared by every facet pass below)
    # =============================================================================
    def write_facet(facet_type, facet_desc, facet_count):
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0}
                (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                 FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                VALUES
                ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],      #[0]
            g['MSMT_DTE_ID'],  #[1]
            g['DATA_TYPE'],    #[2]
            g['CNTRY_CDE'],    #[3]
            g['SITE_CDE'],     #[4]
            facet_type,        #[5]
            facet_desc,        #[6]
            facet_count,       #[7]
            g['STARTED_AT'],   #[8]
            ''                 #[9]
        )
        dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup.encode("utf-8","ignore").decode('ascii', 'ignore'))

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - the only other sections that "may" change are the DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASSES 1-3 - JOBTYPE, SALARY AND MARKETS COUNTS =============================================
    # the three facet blocks share one structure, differing only in the div id;
    # the catch-all 'ANY' entry is skipped in the JobType list only
    for div in soup.find_all('div', class_='jsCustomScrollContent'):
        for facet_id in ('JobType', 'Salary', 'Markets'):
            for div_child in div.find_all('div', id=facet_id):
                # RETURN THE SECTION HEADER (FACET TYPE) FROM THE CHILD ELEMENTS
                for span in div_child.find_all('span'):
                    facet_type = span.text.upper()
                for div_data in div_child.find_all('div', class_='refineitem'):
                    facet = div_data.find_all('label')
                    # FACET DESC (type desc)
                    facet_desc = re.search('>(.*)</label', str(facet[0]))
                    facet_desc = str(facet_desc.group(1)).upper()
                    facet_desc = facet_desc.replace('&amp;', '&')
                    # FACET COUNT
                    facet_count = re.search('>(.*)</label', str(facet[1]))
                    if facet_count.group(1):
                        facet_count = facet_count.group(1)
                    else:
                        facet_count = '0'
                    # REMOVE THE CATCH-ALL ENTRY IN THE JOBTYPE LIST FROM THE INSERT
                    if facet_id == 'JobType' and facet_desc == 'ANY':
                        continue
                    write_facet(facet_type, facet_desc, facet_count)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
            and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
            and cntry_cde = '{3}'
            and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # (shared by every facet pass below)
    # =============================================================================
    def write_facet(facet_type, facet_desc, facet_count):
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0}
                (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                 FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                VALUES
                ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],      #[0]
            g['MSMT_DTE_ID'],  #[1]
            g['DATA_TYPE'],    #[2]
            g['CNTRY_CDE'],    #[3]
            g['SITE_CDE'],     #[4]
            facet_type,        #[5]
            facet_desc,        #[6]
            facet_count,       #[7]
            g['STARTED_AT'],   #[8]
            ''                 #[9]
        )
        dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(str(soup).encode('ascii', 'ignore'))

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - the only other sections that "may" change are the DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 0 - TOTAL COUNT =======================================================================
    facet_type = 'TOTAL JOBS'
    for div in soup.find_all('div', class_='counter'):
        for i, span in enumerate(div.find_all('span')):
            if i == 0:
                facet_desc = 'ALL JOBS'
            elif i == 1:
                facet_desc = 'ALL COMPANIES'
            else:
                facet_desc = 'NOT CATEGORISED'
            txt1 = span.text.replace(',', '')
            txt2 = re.findall(r'\d+', txt1)
            facet_count = txt2[0]
            write_facet(facet_type, facet_desc, facet_count)

    # PASS 1 - REGION COUNT ======================================================================
    for div in soup.find_all('div', id='locationTabContent'):
        facet_type = 'REGION'
        for li in div.find_all('li'):
            for a in li.find_all('a', class_='region', href=True):
                facet_desc = a.text.upper().replace('JOBS IN', '').strip()
            for span in li.find_all('span'):
                txt1 = span.text.replace(',', '')
                txt2 = re.findall(r'\d+', txt1)
                facet_count = txt2[0]
                write_facet(facet_type, facet_desc, facet_count)

    # PASS 2 - INDUSTRY COUNT ====================================================================
    for div in soup.find_all('div', id='sectorTabContent'):
        facet_type = 'INDUSTRY'
        for li in div.find_all('li'):
            for a in li.find_all('a', href=True):
                facet_desc = a.text.upper().strip()
            for span in li.find_all('span'):
                txt1 = span.text.replace(',', '')
                txt2 = re.findall(r'\d+', txt1)
                facet_count = txt2[0]
                write_facet(facet_type, facet_desc, facet_count)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' +
                     facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html')
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
            and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
def htmlPass(url, **g):
    try:
        # ================================================================
        # EXTRACT HTML USING PARSER OR WEB DRIVER CONFIG
        # ================================================================
        if g['USES_WEB_DRVR'] == 'N':
            req = Request(url)
            # was: req.add_header = [(...)], which clobbers the method
            # instead of calling it; add_header() must be invoked
            req.add_header('User-agent', 'Google Chrome')
            html = urlopen(req).read()
        elif g['USES_WEB_DRVR'] == 'Y':
            driver = webdriver.Chrome(
                executable_path=str(g['DRVR_PATH'] + '\\' + g['WEB_DRVR_NME']))
            driver.get(url)
            # SLEEP REQUIRED BECAUSE SEEK TRIES TO REDIRECT THE PAGE AND MESSES WITH
            # THE CAPTURE; FORCES A WAIT FOR THE PAGE TO FULLY RENDER BEFORE CAPTURING
            if 'SEEK' in url.upper():
                time.sleep(10)
            html = driver.page_source
            driver.close()
            driver.quit()
            driver.stop_client()
        return html
    except:
        finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
        e = sys.exc_info()
        print('ERROR ENCOUNTERED : ' + str(e))
        # =============================================================================
        # WRITE RESULTS OF ERROR TO LOCAL DB
        # =============================================================================
        dbmgr = pyDB(g['DB'])
        dbmgr.write_log(finished_at, 'HTML PASSING ERROR: ' + str(e), **g)
        # =============================================================================
        # EMAIL SUMMARY OF ERROR TO DISTRIBUTION LIST
        # =============================================================================
        htmlRes = '''<table cellpadding="8" cellspacing="3" border="3">
                     <tr>
                       <th>msmt_date_id</th>
                       <th>pkg_nme</th>
                       <th>start_datetime</th>
                       <th>end_datetime</th>
                       <th>status</th>
                     </tr>'''
        htmlRes = htmlRes + ('<tr><td>' + str(g['MSMT_DTE_ID']) + '</td><td>' +
                             str(g['PKG_NME']) + '</td><td>' + str(g['STARTED_AT']) +
                             '</td><td>' + str(finished_at) + '</td><td>' + 'ERROR' + '</td></tr>')
        htmlRes = htmlRes + '</table>'
        # SUBJECT & RECIPIENTS
        mymail = pyMail(str(g['PKG_NME']) + ' - ERROR ENCOUNTERED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g)
        # START HTML BODY (GREETING / OPENING LINE OF EMAIL)
        mymail.htmladd('A HTML PASSING ERROR was encountered for package : ' + str(g['PKG_NME']))
        # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH, SO LINE FEEDS NOT REQUIRED)
        # ADD LINE OF TEXT
        mymail.htmladd('Summary of ERROR')
        # ADD HTML TABLE CONSTRUCTED ABOVE
        mymail.htmladd(htmlRes)
        # HEADER FOR ERROR TEXT
        mymail.htmladd('<b><u>ERROR DETAIL</u></b>')
        # ADD FULL ERROR TO BODY OF EMAIL (ANGLE BRACKETS ESCAPED FOR HTML)
        mymail.htmladd(str(e).replace('<', '(').replace('>', ')'))
        # SEND
        mymail.send()
        # QUIT EXECUTION OF PYTHON SCRIPT
        quit()
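# NOTE: the original carried commented-out requests lines hinting at an
# intended alternative to urllib. A minimal sketch of that header-carrying
# fetch, assuming the requests library is available; the timeout value is an
# assumption:
import requests

def html_pass_requests(url):
    """Fetch page HTML with a browser-ish User-agent, as the urllib branch does."""
    headers = {'User-agent': 'Google Chrome'}
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()  # surface HTTP errors instead of parsing an error page
    return resp.text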
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
            and cntry_cde = '{3}'
            and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    def write_facet(facet_type, facet_desc, facet_count):
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0}
                (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                 FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                VALUES
                ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],      #[0]
            g['MSMT_DTE_ID'],  #[1]
            g['DATA_TYPE'],    #[2]
            g['CNTRY_CDE'],    #[3]
            g['SITE_CDE'],     #[4]
            facet_type,        #[5]
            facet_desc,        #[6]
            facet_count,       #[7]
            g['STARTED_AT'],   #[8]
            ''                 #[9]
        )
        dbmgr.query(q)

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - the only other sections that "may" change are the DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - INDUSTRY COUNT =====================================================================
    facet_type = 'INDUSTRY'
    industries_array = []
    # CONVERTS COMMA-SEPARATED STRING TO LIST FOR LOOPING
    for item in g['INDUSTRY'].split(','):
        industries_array.append(item)

    # LOOP THROUGH EVERY INDUSTRY IN THE LIST
    for industry in industries_array:
        facet_desc = industry.upper().replace('-JOBS', '').replace('/', '')
        # NOTE: the inner while True retries forever on a persistent failure,
        # so the range(10) cap is never actually reached (see the bounded
        # retry sketch after this function)
        for i in range(10):
            print("iteration {0} ({1}) starting".format(i, facet_desc))
            while True:
                try:
                    time.sleep(rndm_sleep)  # PAUSE TO MAKE INTERACTION LOOK MORE HUMAN
                    # =============================================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================================
                    url = g['URL'] + '/{}'.format(industry)
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    facet_count = re.search(
                        r'PAGE 1 OF(.*?)JOBS</DIV>',
                        str(soup).encode("utf-8", "ignore").decode('ascii', 'ignore').upper()).group(1)
                    facet_count = int(facet_count.replace(',', ''))
                except:
                    e = sys.exc_info()
                    print("iteration {0} ({1}) failed with error : {2}".format(i, facet_desc, e))
                    continue
                break
            write_facet(facet_type, facet_desc, facet_count)
            break

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
            and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
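# NOTE: a sketch of a bounded retry for the fetch-and-parse step above, which
# gives up instead of looping forever; fetch_count is a hypothetical helper and
# pattern/attempts/delay are assumptions:
def fetch_count(url, pattern, attempts=10, delay=5, **g):
    """Fetch a page and parse a job count, giving up after `attempts` tries."""
    last_err = None
    for i in range(attempts):
        try:
            html = pyHTMLPass.htmlPass(url, **g)
            soup = BeautifulSoup(html, "html.parser")
            return int(re.search(pattern, str(soup).upper()).group(1).replace(',', ''))
        except Exception as err:
            last_err = err
            time.sleep(delay)  # back off before the next attempt
    raise last_err  # re-raise the last error so the caller can log it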
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
            and cntry_cde = '{3}'
            and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    def write_facet(facet_type, facet_desc, facet_count):
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0}
                (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                 FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                VALUES
                ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],      #[0]
            g['MSMT_DTE_ID'],  #[1]
            g['DATA_TYPE'],    #[2]
            g['CNTRY_CDE'],    #[3]
            g['SITE_CDE'],     #[4]
            facet_type,        #[5]
            facet_desc,        #[6]
            facet_count,       #[7]
            g['STARTED_AT'],   #[8]
            ''                 #[9]
        )
        dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - the only other sections that "may" change are the DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # COLLECT ALL JOBS-RELATED LINKS
    catLinks = []
    for links in soup.find_all('a'):
        full_ref = str(links)
        link_txt = str(links.get('href'))
        if '/JOBS-IN-' in full_ref.upper() and 'PRIORITY' not in full_ref.upper():
            catLinks.append(link_txt)
    #print(catLinks)

    # PASS 1 - INDUSTRY COUNT =====================================================================
    facet_type = 'INDUSTRY'
    for link in catLinks:
        for i in range(10):
            print("iteration {0} ({1}) starting".format(i, link))
            while True:
                try:
                    time.sleep(rndm_sleep)  # PAUSE TO MAKE INTERACTION LOOK MORE HUMAN
                    # =============================================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================================
                    url = g['URL'] + link
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    title_txt = soup.title.string.upper()
                    idx = title_txt.find(' JOBS')
                    facet_desc = title_txt[:idx]
                    for span in soup.find_all('span', id='SearchSummary'):
                        for h1 in span.find_all('h1'):
                            nbr = re.search(r'COUNT">(.*?)</STRONG>', str(soup).upper()).group(0)
                            nbr = str(nbr).replace(',', '')
                            nbr = re.findall(r'\d+', nbr)
                            facet_count = nbr[0]
                except:
                    e = sys.exc_info()
                    # report the link, not facet_desc, which may be unset on the first failure
                    print("iteration {0} ({1}) failed with error : {2}".format(i, link, e))
                    continue
                break
            write_facet(facet_type, facet_desc, facet_count)
            break

        # =============================================================================
        # WRITE HTML PAGE TO FILE
        # =============================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' +
                         facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html')
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
                f.writelines(str(soup))

    # PASS 2 - TOTAL COUNT ========================================================================
    time.sleep(rndm_sleep)  # PAUSE TO MAKE INTERACTION LOOK MORE HUMAN
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'
    nbr = re.search(r'COUNT">(.*?)</STRONG>', str(soup).upper()).group(0)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]
    write_facet(facet_type, facet_desc, facet_count)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' +
                     facet_type.replace(' ', '_').replace('/', '-') + '_' +
                     facet_desc.replace(' ', '_').replace('/', '-') + '.html')
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # PASS 3 - REGION COUNT =======================================================================
    facet_type = 'REGION'
    regions_array = []
    # CONVERTS COMMA-SEPARATED STRING TO LIST FOR LOOPING
    for item in g['REGIONS'].split(','):
        regions_array.append(item)

    # LOOP THROUGH EVERY REGION IN THE LIST
    for region in regions_array:
        for i in range(10):
            print("iteration {0} ({1}) starting".format(i, region))
            while True:
                try:
                    time.sleep(rndm_sleep)  # PAUSE TO MAKE INTERACTION LOOK MORE HUMAN
                    # =============================================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================================
                    url = g['URL'] + g['URL_PART1'] + g['URL_PART2'] + '{}'.format(region.replace(' ', '-'))
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    facet_desc = str(region.upper())
                    nbr = re.search(r'COUNT">(.*?)</STRONG>', str(soup).upper()).group(0)
                    nbr = str(nbr).replace(',', '')
                    nbr = re.findall(r'\d+', nbr)
                    facet_count = nbr[0]
                except:
                    e = sys.exc_info()
                    # report the region, not facet_desc, which may be unset on the first failure
                    print("iteration {0} ({1}) failed with error : {2}".format(i, region, e))
                    continue
                break
            write_facet(facet_type, facet_desc, facet_count)
            break

        # =============================================================================
        # WRITE HTML PAGE TO FILE
        # =============================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' +
                         facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html')
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
                f.writelines(str(soup))

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
            and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
            and cntry_cde = '{3}'
            and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # (shared by every facet pass below)
    # =============================================================================
    def write_facet(facet_type, facet_desc, facet_count):
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0}
                (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                 FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                VALUES
                ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],      #[0]
            g['MSMT_DTE_ID'],  #[1]
            g['DATA_TYPE'],    #[2]
            g['CNTRY_CDE'],    #[3]
            g['SITE_CDE'],     #[4]
            facet_type,        #[5]
            facet_desc,        #[6]
            facet_count,       #[7]
            g['STARTED_AT'],   #[8]
            ''                 #[9]
        )
        dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - the only other sections that "may" change are the DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - CATEGORY COUNT =====================================================================
    for div in soup.find_all('div', id="category"):
        for div_sub in div.find_all('div'):
            # SECTION HEADER (FACET TYPE)
            txt = re.findall(r'SECTORS">(.+?)</A>', str(div_sub).upper())
            facet_type = str(txt[0]).upper()
        for li in div.find_all('li'):
            txt = re.findall(r'JOBS">(.+?)</A>', str(li).upper())
            facet_desc = str(txt[0]).upper().replace('&amp;', '&')
            find_nbr = re.findall(r'\([0-9]*\)', str(li))  # find the count within brackets
            if find_nbr:
                facet_count = int(str(find_nbr[0]).replace('(', '').replace(')', ''))
                write_facet(facet_type, facet_desc, facet_count)

    # PASS 2 - REGION COUNT =======================================================================
    for div in soup.find_all('div', id="location"):
        for div_sub in div.find_all('div'):
            # SECTION HEADER (FACET TYPE)
            txt = re.findall(r'LOCATIONS">(.+?)</A>', str(div_sub).upper())
            facet_type = str(txt[0]).upper()
        for li in div.find_all('li'):
            txt = re.findall(r'">(.+?)</A>', str(li).upper())
            facet_desc = str(txt[0]).upper().replace('&amp;', '&')
            find_nbr = re.findall(r'\([0-9]*\)', str(li))  # find the count within brackets
            if find_nbr:
                facet_count = int(str(find_nbr[0]).replace('(', '').replace(')', ''))
                write_facet(facet_type, facet_desc, facet_count)

    # PASS 3 - JOB TYPE ===========================================================================
    for div in soup.find_all('div', id="subcategory"):
        for div_sub in div.find_all('div'):
            # SECTION HEADER (FACET TYPE)
            txt = re.findall(r'TYPES">(.+?)</A>', str(div_sub).upper())
            facet_type = str(txt[0]).upper()
        for li in div.find_all('li'):
            txt = re.findall(r'JOBS">(.+?)</A>', str(li).upper())
            facet_desc = str(txt[0]).upper().replace('&amp;', '&')
            find_nbr = re.findall(r'\([0-9]*\)', str(li))  # find the count within brackets
            if find_nbr:
                facet_count = int(str(find_nbr[0]).replace('(', '').replace(')', ''))
                write_facet(facet_type, facet_desc, facet_count)

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
            and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
def scrape(): # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN" rLow = int(g['LOOP_RNDM_SLEEP_LOW']) rHigh = int(g['LOOP_RNDM_SLEEP_HIGH']) rndm_sleep = random.randint(rLow, rHigh) # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE retention_date = datetime.date.today() + datetime.timedelta( -int(g['DATA_RETENTION_DAYS'])) retention_date_id = retention_date.strftime('%Y%m%d') # ============================================================================= # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] retention_date_id, #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'] #[4] ) dbmgr.query(q) # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'] passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") #soup = soup.encode("utf-8","ignore").decode('ascii', 'ignore') #print(soup) # ========================================================================================================================================================== # SCRAPE PART - START # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE DB statements # ========================================================================================================================================================== # COLLECT ALL JOBS RELATED LINKS regionLinksList = [] industryLinksList = [] jobtypeLinksList = [] # PASS 1 - COLLECT LINKS FOR THE VARIOUS TYPES ==================================== for ul in soup.find_all('ul', class_='provinceList'): for links in ul.find_all('a'): link = str(links.get('href')) regionLinksList.append(link) #print(regionLinksList) for ul in soup.find_all('ul', class_='categoryList'): for links in ul.find_all('a'): link = str(links.get('href')) industryLinksList.append(link) #print(industryLinksList) #for ul in soup.find_all('ul', class_='studentsList'): # for links in ul.find_all('a'): # link = str(links.get('href')) # jobtypeLinksList.append(link) # PASS 2 - COLLECT REGION DATA ==================================================== facet_type = 'REGION' for link in regionLinksList: time.sleep( rndm_sleep ) # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link) #url = link #.replace(href_search_str, '') passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") #print(soup) for h1 in soup.find_all('h1'): for links in h1.find_all('a'): facet_desc = links.text.upper().replace('JOBS', '').strip() link = str(links.get('href')) # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # 
============================================================================= url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link) #url = link #.replace(href_search_str, '') passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") for div in soup.find_all('div', class_='ResultText'): #result-count for span in div.find_all('span', class_='ResultText-numTotal'): facet_count = span.text #re.search(r' of(.*?)</strong>',str(strong)).group(1) #facet_count = strong.text.upper() #facet_count = facet_count.split('OF',1)[1] facet_count = facet_count.strip() # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) dbmgr.query(q) # ============================================================================= # WRITE HTML PAGE TO FILE # ============================================================================= if g['WRITE_HTML_TO_FILE'] == 'Y': file_name = g['MSMT_DTE_ID'] + '_' + g[ 'CNTRY_CDE'] + '_' + g[ 'SITE_CDE'] + '_' + facet_type.replace( ' ', '_') + '_' + facet_desc.replace( ' ', '_') + '.html' with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f: f.writelines(str(soup)) f.close() # PASS 3 - COLLECT INDUSTRY DATA ================================================== facet_type = 'INDUSTRY' for link in industryLinksList: time.sleep( rndm_sleep ) # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link) #url = link #.replace(href_search_str, '') passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") #print(soup) for h1 in soup.find_all('h1'): for links in h1.find_all('a'): facet_desc = links.text.upper().replace('JOBS', '').strip() link = str(links.get('href')) # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link) #url = link #.replace(href_search_str, '') passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") for div in soup.find_all('div', class_='ResultText'): for span in div.find_all('span', class_='ResultText-numTotal'): facet_count = span.text #re.search(r' of(.*?)</strong>',str(strong)).group(1) facet_count = facet_count.strip() # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} 
(MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) dbmgr.query(q) # ============================================================================= # WRITE HTML PAGE TO FILE # ============================================================================= if g['WRITE_HTML_TO_FILE'] == 'Y': file_name = g['MSMT_DTE_ID'] + '_' + g[ 'CNTRY_CDE'] + '_' + g[ 'SITE_CDE'] + '_' + facet_type.replace( ' ', '_') + '_' + facet_desc.replace( ' ', '_') + '.html' with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f: f.writelines(str(soup)) f.close() # PASS 4 - COLLECT JOBTYPE DATA =================================================== facet_type = 'JOB TYPE' for link in jobtypeLinksList: time.sleep( rndm_sleep ) # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link) #url = link #.replace(href_search_str, '') passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") #print(soup) for h1 in soup.find_all('h1', class_='sr-search-title'): facet_desc = h1.text.upper().replace('JOBS', '').strip() #print(facet_desc) for div in soup.find_all('div', class_='result-count'): for p in div.find_all('p'): facet_count = p.text.upper() facet_count = facet_count.split('OF', 1)[1] facet_count = facet_count.strip() # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) dbmgr.query(q) # ============================================================================= # WRITE HTML PAGE TO FILE # ============================================================================= if g['WRITE_HTML_TO_FILE'] == 'Y': file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[ 'SITE_CDE'] + '_' + facet_type.replace( ' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html' with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f: f.writelines(str(soup)) f.close() # ========================================================================================================================================================== # SCRAPE PART - END # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE db statements # ========================================================================================================================================================== # 
============================================================================= # UPDATE LOCAL DB WITH A FINISH TIME # ============================================================================= finished_at = time.strftime( "%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db dbmgr = pyDB(g['DB']) q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format( g['TBL_NME'], #[0] finished_at, #[1] g['CNTRY_CDE'], #[2] g['MSMT_DTE_ID'] #[3] ) dbmgr.query(q)
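# =============================================================================
# ILLUSTRATIVE SKETCH (not part of the package; helper name is hypothetical):
# the retention-date calculation repeated at the top of each scrape() above,
# isolated as a standalone stdlib-only function.
# =============================================================================
import datetime

def calc_retention_date_id(retention_days):
    # e.g. retention_days=30 on 2018-06-30 returns '20180531'
    cutoff = datetime.date.today() - datetime.timedelta(days=int(retention_days))
    return cutoff.strftime('%Y%m%d')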
def scrape(): # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN" rLow = int(g['LOOP_RNDM_SLEEP_LOW']) rHigh = int(g['LOOP_RNDM_SLEEP_HIGH']) rndm_sleep = random.randint(rLow, rHigh) # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE retention_date = datetime.date.today() + datetime.timedelta( -int(g['DATA_RETENTION_DAYS'])) retention_date_id = retention_date.strftime('%Y%m%d') # ============================================================================= # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] retention_date_id, #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'] #[4] ) dbmgr.query(q) # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'] + g['URL_PART1'] passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") #soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS #soup = str(soup) #print(soup) # ========================================================================================================================================================== # SCRAPE PART - START # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE DB statements # ========================================================================================================================================================== # PASS 1 - TOTAL COUNT ======================================================================== facet_type = 'TOTAL' facet_desc = 'ALL JOBS' nbr = re.search('<title>(.*?)</title>', str(soup.encode("utf-8"))).group(1) nbr = str(nbr).replace(',', '') nbr = re.findall('\d+', nbr) facet_count = nbr[0] facet_count = int(facet_count) # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) dbmgr.query(q) # PASS 2 - INDUSTRY COUNT ===================================================================== for ul in soup.find_all('ul', class_='facet'): for li in ul.find_all('li'): # RETURN THE FACET TEXT (SECTION TITLE) facet = li.find( 'strong' ) # ASSUMES THE FIRST ROW OF THE FACET IS THE "TITLE" ROW - BREAKS IF IT ISNT if facet: facet_type = facet.text.upper() else: facet_type = facet_type.upper( ) # IF NONE IS FOUND, APPLY CURRENT FACET_TYPE VALUE TO NEXT FACET_TYPE VALUE facet_desc = li.find('a') if facet_desc: # CHECKS IF THERE IS A RESULT ON THE SEARCH FOR THE "A" ANCHOR (REMOVES THE TITLE OF THE SECTIONS BY DEFAULT - RETURNED ABOVE) facet_desc = facet_desc.text.upper() facet_desc = re.sub( r"[!@#$']", '', 
str(facet_desc)) # REMOVES SPECIAL CHARACTERS FROM STRING facet_count = li.find('span') facet_count = int(facet_count.text.replace(',', '')) # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) dbmgr.query(q) else: # IF NO "A" ANCHOR IS FOUND, IGNORE None # ============================================================================= # WRITE HTML PAGE TO FILE # ============================================================================= if g['WRITE_HTML_TO_FILE'] == 'Y': file_name = g['MSMT_DTE_ID'] + '_' + g[ 'CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace( ' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html' with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f: f.writelines(str(soup)) f.close() # ========================================================================================================================================================== # SCRAPE PART - END # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE db statements # ========================================================================================================================================================== # ============================================================================= # UPDATE LOCAL DB WITH A FINISH TIME # ============================================================================= finished_at = time.strftime( "%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db dbmgr = pyDB(g['DB']) q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format( g['TBL_NME'], #[0] finished_at, #[1] g['CNTRY_CDE'], #[2] g['MSMT_DTE_ID'] #[3] ) dbmgr.query(q)
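# =============================================================================
# ILLUSTRATIVE SKETCH (an assumption, not the pyDB API): the INSERTs above are
# built with str.format(), which breaks if facet text ever contains a quote.
# A minimal sqlite3 version with bound parameters; only the table name, which
# cannot be bound, is formatted in.
# =============================================================================
import sqlite3

def insert_facet_row(db_path, tbl_nme, row):
    # row is the 9-tuple (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE,
    # FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
    q = ("INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, "
         "FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) "
         "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)").format(tbl_nme)
    with sqlite3.connect(db_path) as conn:
        conn.execute(q, row)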
def scrape(): # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN" rLow = int(g['LOOP_RNDM_SLEEP_LOW']) rHigh = int(g['LOOP_RNDM_SLEEP_HIGH']) rndm_sleep = random.randint(rLow, rHigh) # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE retention_date = datetime.date.today() + datetime.timedelta( -int(g['DATA_RETENTION_DAYS'])) retention_date_id = retention_date.strftime('%Y%m%d') # ============================================================================= # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""DELETE FROM {0} WHERE (captr_dte_id = {1} or captr_dte_id <= {2})""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] retention_date_id, #[2] ) dbmgr.query(q) # ========================================================================================================================================================== # SCRAPE PART - START # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE DB statements # ========================================================================================================================================================== # PASS 1 - COMMODITY DATA ===================================================================== quandl.ApiConfig.api_key = g['QUANDL_API_KEY'] # quandl.get("NASDAQOMX/NQCIGCTR", authtoken="-7YMD_XEY7yvNsYDX92s") # GOLD # quandl.get("NASDAQOMX/NQCISITR", authtoken="-7YMD_XEY7yvNsYDX92s") # SILVER # quandl.get("NASDAQOMX/NQCICUTR", authtoken="-7YMD_XEY7yvNsYDX92s") # COPPER # quandl.get("NASDAQOMX/NQCICLTR", authtoken="-7YMD_XEY7yvNsYDX92s") # CRUDE OIL LIGHT # quandl.get("NASDAQOMX/NQCICBTR", authtoken="-7YMD_XEY7yvNsYDX92s") # CRUDE OIL BRENT # quandl.get("NASDAQOMX/NQCINGTR", authtoken="-7YMD_XEY7yvNsYDX92s") # NATURAL GAS # quandl.get("NASDAQOMX/NQCIKCTR", authtoken="-7YMD_XEY7yvNsYDX92s") # COFFEE # quandl.get("NASDAQOMX/NQCICCTR", authtoken="-7YMD_XEY7yvNsYDX92s") # COCOA for key, value in g['CMDTY'].items(): try: data_type = key # RETURN DATAFRAME dat = quandl.get(value, authtoken=g['QUANDL_API_KEY'], rows=5) for index, row in dat.iterrows(): #print( index, row[0], row[1], row[2], row[3], row[4]) # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, TRADE_DT, INDEX_VAL, HIGH_VAL, LOW_VAL, TTL_MRKT_VAL, DIV_MRKT_VAL, CAPTR_DTE_ID, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', {9}, '{10}', '{11}')""".format( g['TBL_NME'], #[0] index.strftime('%Y%m%d'), #[1] time.strftime('%Y%m%d') data_type, #[2] index.strftime('%Y-%m-%d'), #[3] row[0], #[4] row[1], #[5] row[2], #[6] row[3], #[7] row[4], #[8] g['MSMT_DTE_ID'], #[9] g['STARTED_AT'], #[10] '' #[11] ) dbmgr.query(q) except: # capture a finish time to be entered into the db finished_at = time.strftime("%Y-%m-%d %H:%M:%S") # ============================================================================= # WRITE RESULTS OF ERROR TO LOCAL DB # ============================================================================= e = sys.exc_info() dbmgr = pyDB(g['DB']) dbmgr.write_log(finished_at, 'QUANDL API ERROR: ' + str(e), **g) # 
========================================================================================================================================================== # SCRAPE PART - END # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE db statements # ========================================================================================================================================================== # ============================================================================= # UPDATE LOCAL DB WITH A FINISH TIME # ============================================================================= finished_at = time.strftime( "%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db dbmgr = pyDB(g['DB']) q = r"""UPDATE {0} SET finished_at = '{1}' WHERE captr_dte_id = {2}""".format( g['TBL_NME'], #[0] finished_at, #[1] g['MSMT_DTE_ID'] #[2] ) dbmgr.query(q)
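# =============================================================================
# ILLUSTRATIVE SKETCH (hypothetical names; assumes the quandl package and a
# valid key, as used above): quandl.get returns a pandas DataFrame indexed by
# trade date, which is why the loop above unpacks (index, row) from iterrows().
# =============================================================================
import quandl

def fetch_recent_index(code, api_key, rows=5):
    # rows=N limits the call to the most recent N observations (rows=5 above)
    dat = quandl.get(code, authtoken=api_key, rows=rows)
    for trade_dt, row in dat.iterrows():
        # trade_dt is a pandas Timestamp; the first five columns mirror
        # INDEX_VAL, HIGH_VAL, LOW_VAL, TTL_MRKT_VAL, DIV_MRKT_VAL above
        yield trade_dt.strftime('%Y%m%d'), list(row.iloc[:5])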
def tweepySearch(searchQuery, sinceId, sentmnt_mtch, place_id, cc, **g): # SETUP TWITTER AUTHORISATION auth = tweepy.AppAuthHandler(g['TWIT_CNSMR_KEY'], g['TWIT_CNSMR_SECRET']) api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True) if (not api): print('UNABLE TO AUTHENTICATE') sys.exit(-1) # IF ONLY RESULTS BELOW A SPECIFIC ID ARE REQUIRED, SET MAX_ID TO THAT ID. # ELSE DEFAULT TO NO UPPER LIMIT AND START FROM THE MOST RECENT TWEET MATCHING THE SEARCH QUERY. max_id = -1 tweetCount = 0 print("Downloading max {0} tweets".format(g['TWIT_MAX_TWEETS'])) while tweetCount < int(g['TWIT_MAX_TWEETS']): try: if (max_id <= 0): if (not sinceId): tweets = api.search( q=searchQuery, count=int( g['TWIT_TWEETS_PER_QRY'])) #q=searchQuery else: tweets = api.search(q=searchQuery, count=int( g['TWIT_TWEETS_PER_QRY']), since_id=sinceId) else: if (not sinceId): tweets = api.search(q=searchQuery, count=int( g['TWIT_TWEETS_PER_QRY']), max_id=str(max_id - 1)) else: tweets = api.search(q=searchQuery, count=int( g['TWIT_TWEETS_PER_QRY']), max_id=str(max_id - 1), since_id=sinceId) if not tweets: print("No more tweets found") break # ============================================================================= # PROCESS TWEETS COLLECTED FROM THE SEARCH API # ============================================================================= for tweet in tweets: # ============================================================================= # WRITE RESULTS OF EACH TWEET TO LOCAL DB # ============================================================================= created_at = str(tweet.created_at).split(' ') created_at = created_at[0].replace('-', '') #print(tweet.encode('ascii', 'replace').decode("utf-8")) dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, CREATED_AT, TWEET_ID, USER_ID, USER_NAME, USER_SCREEN_NAME, USER_LOCATION, CNTRY_ID, CNTRY_CDE, PLACE_NAME, SENTMT_MATCH, TWEET_TXT, IN_REPLY_TO, RE_TWEETED, PRCES_DTE_ID, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', {3}, {4}, '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}', '{13}', '{14}', {15}, '{16}', '{17}')""".format( g['TBL_NME'], #[0] created_at, #[1] str(tweet.created_at), #[2] tweet.id, #[3] tweet.user.id, #[4] str( tweet.user.name.encode( 'ascii', 'replace').decode("utf-8")).replace( '?', '').replace("'", '').rstrip().lstrip(), #[5] str( tweet.user.screen_name.encode( 'ascii', 'replace').decode("utf-8")).replace( '?', '').replace("'", '').rstrip().lstrip(), #[6] str( tweet.user.location.encode( 'ascii', 'replace').decode("utf-8")).replace( '?', '').replace( "'", '').rstrip().lstrip().upper(), #[7] str(place_id), #[8] cc, #[9] str( tweet.place.name.encode( 'ascii', 'replace').decode("utf-8")).replace( '?', '').replace( "'", '').rstrip().lstrip().upper(), #[10] sentmnt_mtch, #[11] str( tweet.text.encode( 'ascii', 'replace').decode("utf-8")).replace( '?', '').replace("'", '').replace( '\n', '. ').replace('. . ', '. 
'), #[12] ('NOVAL' if tweet.in_reply_to_status_id_str is None else str( tweet.in_reply_to_status_id_str).upper()), #[13] str(tweet.retweeted).upper(), #[14] g['MSMT_DTE_ID'], #[15] g['STARTED_AT'], #[16] '' #[17] ) #print(q) dbmgr.query(q) tweetCount += len(tweets) print("Downloaded {0} tweets".format(tweetCount)) max_id = tweets[-1].id except tweepy.TweepError as e: # capture a finish time to be entered into the db finished_at = time.strftime("%Y-%m-%d %H:%M:%S") e = sys.exc_info() print('ERROR ENCOUNTERED : ' + str(e)) # ============================================================================= # WRITE RESULTS OF ERROR TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) dbmgr.write_log(finished_at, 'TWITTER SEARCH ERROR : ' + str(e), **g) # ============================================================================= # EMAIL SUMMARY OF ERROR TO DISTRIBUTION LIST # ============================================================================= htmlRes = '''<table cellpadding="8" cellspacing="3" border="3"> <tr> <th>msmt_date_id</th> <th>pkg_nme</th> <th>start_datetime</th> <th>end_datetime</th> <th>status</th> </tr>''' htmlRes = htmlRes + '<tr><td>' + str( g['MSMT_DTE_ID'] ) + '</td><td>' + str(g['PKG_NME']) + '</td><td>' + str( g['STARTED_AT']) + '</td><td>' + str( finished_at) + '</td><td>' + 'ERROR' + '</td></tr>' htmlRes = htmlRes + '</table>' # SUBJECT & RECIPIENTS mymail = pyMail( str(g['PKG_NME']) + ' - ERROR ENCOUNTERED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g) # START HTML BODY (GREETING / OPENING LINE OF EMAIL) mymail.htmladd( 'A TWITTER SEARCH ERROR was encountered for package : ' + str(g['PKG_NME'])) # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH SO LINE FEEDS NOT REQUIRED) # ADD LINE OF TEXT mymail.htmladd('Summary of ERROR') # ADD HTML TABLE CONSTRUCTED ABOVE mymail.htmladd(htmlRes) # HEADER FOR ERROR TEXT mymail.htmladd('<b><u>ERROR DETAIL</u></b>') # ADD FULL ERROR TO BODY OF EMAIL mymail.htmladd(str(e).replace('<', '(').replace('>', ')')) # SEND mymail.send(**g) # QUIT EXECUTION OF PYTHON SCRIPT # EXIT IF ANY ERROR print("some error : " + str(e)) break print("Downloaded {0} tweets".format(tweetCount))
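# =============================================================================
# ILLUSTRATIVE SKETCH (hypothetical names): the cursoring pattern tweepySearch()
# uses above, isolated. max_id walks backwards one page at a time (the moving
# newest bound), since_id is the fixed oldest bound; the loop ends when a page
# comes back empty or the tweet budget is spent.
# =============================================================================
def paginate_search(search_fn, since_id=None, max_tweets=1000, per_qry=100):
    max_id = None
    fetched = 0
    while fetched < max_tweets:
        kwargs = {'count': per_qry}
        if since_id:
            kwargs['since_id'] = since_id
        if max_id:
            kwargs['max_id'] = max_id - 1  # strictly older than the last page
        page = search_fn(**kwargs)
        if not page:
            break
        fetched += len(page)
        max_id = page[-1].id  # oldest id on this page
    return fetched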
def scrape(): # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN" rLow = int(g['LOOP_RNDM_SLEEP_LOW']) rHigh = int(g['LOOP_RNDM_SLEEP_HIGH']) rndm_sleep = random.randint(rLow, rHigh) # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE retention_date = datetime.date.today() + datetime.timedelta( -int(g['DATA_RETENTION_DAYS'])) retention_date_id = retention_date.strftime('%Y%m%d') # ============================================================================= # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] retention_date_id, #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'] #[4] ) dbmgr.query(q) # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'] passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") #print(soup) # ========================================================================================================================================================== # SCRAPE PART - START # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE DB statements # ========================================================================================================================================================== # PASS 1 - INDUSTRY COUNT ===================================================================== facet_type = 'INDUSTRY' for div in soup.find_all('div', class_="content-holder container"): for a in div.find_all('a'): for span in a.find_all('span'): if 'TITLE' in str(span).upper(): txt = span.text.upper().replace(r"'", '') elif 'COUNT' in str(span).upper(): nbr = re.findall('\d+', span.text) facet_desc = txt facet_count = int(str(nbr[0])) # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) dbmgr.query(q) # ============================================================================= # WRITE HTML PAGE TO FILE # ============================================================================= if g['WRITE_HTML_TO_FILE'] == 'Y': file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[ 'SITE_CDE'] + '_' + 'SITE_LISTING' + '.html' with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f: f.writelines(str(soup)) f.close() # ========================================================================================================================================================== # SCRAPE PART - END # - this should be the primary section of code that changes # - only 
other sections that "may" change are DELETE and UPDATE db statements # ========================================================================================================================================================== # ============================================================================= # UPDATE LOCAL DB WITH A FINISH TIME # ============================================================================= finished_at = time.strftime( "%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db dbmgr = pyDB(g['DB']) q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format( g['TBL_NME'], #[0] finished_at, #[1] g['CNTRY_CDE'], #[2] g['MSMT_DTE_ID'] #[3] ) dbmgr.query(q)
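# =============================================================================
# ILLUSTRATIVE SKETCH (hypothetical helper): the digit-extraction idiom used
# above (re.findall on span text), wrapped so thousands separators and missing
# counts are handled in one place.
# =============================================================================
import re

def parse_count(text):
    # "Accounting (1,234)" -> 1234; returns 0 when no digits are present
    digits = re.findall(r'\d+', str(text).replace(',', ''))
    return int(digits[0]) if digits else 0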
def scrape(): # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN" rLow = int(g['LOOP_RNDM_SLEEP_LOW']) rHigh = int(g['LOOP_RNDM_SLEEP_HIGH']) rndm_sleep = random.randint(rLow, rHigh) # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE retention_date = datetime.date.today() + datetime.timedelta( -int(g['DATA_RETENTION_DAYS'])) retention_date_id = retention_date.strftime('%Y%m%d') # ============================================================================= # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] retention_date_id, #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'] #[4] ) dbmgr.query(q) # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'] passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") #print(soup) # ========================================================================================================================================================== # SCRAPE PART - START # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE DB statements # ========================================================================================================================================================== # PASS 1 - INDUSTRY & REGION COUNT ============================================================ for div in soup.find_all('div', id='ajaxRefineSearch'): for ref in div.find_all('div', class_='refineItem'): refText = str(ref).upper() # FACET TYPE if 'LOCATION_STATE' in refText: facet_type = 'REGION' elif 'EMPLOYMENTTYPE' in refText: facet_type = 'JOB TYPE' elif 'COMPANYNAME' in refText: facet_type = 'COMPANY NAME' elif 'JOBCATEGORY' in refText: facet_type = 'INDUSTRY' elif 'LOCATION_COUNTRY' in refText: facet_type = 'LOCATION' elif 'SALARYTYPE' in refText: facet_type = 'SALARY ESTIMATE' # FACET DESCRIPTION for links in ref.find_all('a'): linkText = links.string.upper() facet_desc = linkText try: # IGNORES ENTRIES THAT HAVE NO nbr VAL # NUMBER VALUE nbr = re.search(r'\((\d+(?:\.\d+)?)\)', refText).group(1) nbr = str(nbr).replace(',', '') facet_count = nbr #facet_count = re.findall('\d+', nbr) # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) dbmgr.query(q) # ============================================================================= # WRITE HTML PAGE TO FILE # ============================================================================= if g['WRITE_HTML_TO_FILE'] == 'Y': file_name = 
g['MSMT_DTE_ID'] + '_' + g[ 'CNTRY_CDE'] + '_' + g[ 'SITE_CDE'] + '_' + facet_type.replace( ' ', '_') + '_' + facet_desc.replace( ' ', '_') + '.html' with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f: f.writelines(str(soup)) f.close() except: None else: None # ========================================================================================================================================================== # SCRAPE PART - END # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE db statements # ========================================================================================================================================================== # ============================================================================= # UPDATE LOCAL DB WITH A FINISH TIME # ============================================================================= finished_at = time.strftime( "%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db dbmgr = pyDB(g['DB']) q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format( g['TBL_NME'], #[0] finished_at, #[1] g['CNTRY_CDE'], #[2] g['MSMT_DTE_ID'] #[3] ) dbmgr.query(q)
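# =============================================================================
# ILLUSTRATIVE SKETCH (hypothetical names): the refineItem if/elif ladder in
# the scrape above expressed as a lookup table, which keeps the marker ->
# facet-type mapping in one editable place.
# =============================================================================
FACET_MARKERS = {
    'LOCATION_STATE': 'REGION',
    'EMPLOYMENTTYPE': 'JOB TYPE',
    'COMPANYNAME': 'COMPANY NAME',
    'JOBCATEGORY': 'INDUSTRY',
    'LOCATION_COUNTRY': 'LOCATION',
    'SALARYTYPE': 'SALARY ESTIMATE',
}

def classify_facet(ref_text, default='UNKNOWN'):
    ref_text = ref_text.upper()
    for marker, facet in FACET_MARKERS.items():
        if marker in ref_text:
            return facet
    return default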
def scrape(): # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN" rLow = int(g['LOOP_RNDM_SLEEP_LOW']) rHigh = int(g['LOOP_RNDM_SLEEP_HIGH']) rndm_sleep = random.randint(rLow,rHigh) # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS'])) retention_date_id = retention_date.strftime('%Y%m%d') # ============================================================================= # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] retention_date_id, #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'] #[4] ) dbmgr.query(q) # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'] passedHTML = pyHTMLPass.htmlPass(url,**g) soup = BeautifulSoup(passedHTML, "html.parser") #print(soup) # ========================================================================================================================================================== # SCRAPE PART - START # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE DB statements # ========================================================================================================================================================== # PASS 1 - INDUSTRY DETAILS =================================================================== for links in soup.find_all('a'): full_ref = str(links) link_txt = str(links.get('href')) if 'JOBS AVAILABLE IN' in full_ref.upper(): facet_type = 'INDUSTRY' facet_desc = links.string.upper() link_nbr = re.findall('\d+', full_ref) facet_count = ''.join(str(e) for e in link_nbr) # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) dbmgr.query(q) else: None # PASS 2 - REGIONAL DETAILS =================================================================== facet_type = 'REGION' regions = g['REGIONS'] regions_array = [] # CONVERTS LIST OBJECT TO ARRAY FOR LOOPING for item in regions.split(','): # COMMA, OR OTHER regions_array.append(item) # LOOP THROUGH ALL THE ITEM IN REGIONS for region in regions_array: time.sleep(rndm_sleep) # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'] + g['URL_PART1'] + '{}'.format(region.replace(' 
','+')) passedHTML = pyHTMLPass.htmlPass(url,**g) soup = BeautifulSoup(passedHTML, "html.parser") #print(soup) soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS soup = str(soup) # ========================================================================================================================================================== # SCRAPE SUB PART - START # ========================================================================================================================================================== facet_desc = str(region.upper()) facet_count = re.search('1-10 of(.*?)</p>', soup).group(1) facet_count = facet_count.replace(',','').strip() # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) dbmgr.query(q) else: None # ========================================================================================================================================================== # SCRAPE SUB PART - END # ========================================================================================================================================================== # ========================================================================================================================================================== # SCRAPE PART - END # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE db statements # ========================================================================================================================================================== # ============================================================================= # UPDATE LOCAL DB WITH A FINISH TIME # ============================================================================= finished_at = time.strftime("%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db dbmgr = pyDB(g['DB']) q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format( g['TBL_NME'], #[0] finished_at, #[1] g['CNTRY_CDE'], #[2] g['MSMT_DTE_ID'] #[3] ) dbmgr.query(q)
def scrape(): # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN" rLow = int(g['LOOP_RNDM_SLEEP_LOW']) rHigh = int(g['LOOP_RNDM_SLEEP_HIGH']) rndm_sleep = random.randint(rLow, rHigh) # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS'])) retention_date_id = retention_date.strftime('%Y%m%d') # ============================================================================= # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format( g['TBL_NME'], # [0] g['MSMT_DTE_ID'], # [1] retention_date_id, # [2] g['CNTRY_CDE'], # [3] g['SITE_CDE'] # [4] ) dbmgr.query(q) # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'] passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") # print(soup) # ========================================================================================================================================================== # SCRAPE PART - START # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE DB statements # ========================================================================================================================================================== # PASS 1 - TOTAL COUNT ======================================================================== for div in soup.find_all('div', class_='inner cover'): chk_str = str(div).upper() chk_str = chk_str.replace(',', '') nbr = re.search('SEARCH.<B>(\d*)</B>', chk_str).group(1) facet_type = 'TOTAL' facet_desc = 'ALL JOBS' facet_count = int(nbr) # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], # [0] g['MSMT_DTE_ID'], # [1] g['DATA_TYPE'], # [2] g['CNTRY_CDE'], # [3] g['SITE_CDE'], # [4] facet_type, # [5] facet_desc, # [6] facet_count, # [7] g['STARTED_AT'], # [8] '' # [9] ) dbmgr.query(q) # ============================================================================= # WRITE HTML PAGE TO FILE # ============================================================================= if g['WRITE_HTML_TO_FILE'] == 'Y': file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(' ','_') + '_' + facet_desc.replace(' ','_') + '.html' with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,'w+', encoding='utf-8') as f: f.writelines(str(soup)) f.close() # PASS 2 - ALL OTHER FACETS =================================================================== # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # 
============================================================================= url = g['URL'] + g['URL_PART1'] passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") # print(soup) for div in soup.find_all('div', class_="results-filter-content"): for section in div.find_all('section'): # FACET TYPE facet_type = section.find('h3') facet_type = facet_type.text.upper() facet_type = facet_type.replace('HIDE FILTERS', '').replace('DISPLAY FILTERS', '').replace('HELP - EDUCATION OR TRAINING', '').strip() if 'REGIONS' in facet_type: facet_type = 'REGIONS' elif 'CATEGORIES' in facet_type: facet_type = 'CATEGORY' else: facet_type = facet_type # print(facet_type) # FACET DESCRIPTION AND COUNT for li in section.find_all('li'): txt = li.text txt = txt.replace('\\', '~').replace('\n', '~').replace('\r', '~').replace('\t', '~').upper() txt = txt.replace('~', '').replace("'", "").strip() # print(txt) # FACET DESCRIPTION =========================================================== facet_desc = re.findall(' FOUND(.*)', str(txt)) facet_desc = cleanhtml(facet_desc[0]) #facet_desc = str(facet_desc[0]).strip() # print(facet_desc) # FACET COUNT ================================================================= facet_count = re.findall('(\d*)', txt) facet_count = str(facet_count[0]) facet_count = facet_count.replace(',', '') try: facet_count = int(facet_count) except: facet_count = 0 # print(facet_count) # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], # [0] g['MSMT_DTE_ID'], # [1] g['DATA_TYPE'], # [2] g['CNTRY_CDE'], # [3] g['SITE_CDE'], # [4] facet_type, # [5] facet_desc, # [6] facet_count, # [7] g['STARTED_AT'], # [8] '' # [9] ) dbmgr.query(q) # ============================================================================= # WRITE HTML PAGE TO FILE # ============================================================================= if g['WRITE_HTML_TO_FILE'] == 'Y': file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html' with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,'w+', encoding='utf-8') as f: f.writelines(str(soup)) f.close() # ========================================================================================================================================================== # SCRAPE PART - END # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE db statements # ========================================================================================================================================================== # ============================================================================= # UPDATE LOCAL DB WITH A FINISH TIME # ============================================================================= finished_at = time.strftime("%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db dbmgr = pyDB(g['DB']) q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format( g['TBL_NME'], # [0] finished_at, # [1] g['CNTRY_CDE'], # [2] g['MSMT_DTE_ID'] # [3] ) dbmgr.query(q)
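# =============================================================================
# NOTE / SKETCH: cleanhtml() is called in the facet-description parse above but
# is not defined anywhere in this excerpt. A common tag-stripping implementation
# is assumed below; the author's version may differ.
# =============================================================================
import re

def cleanhtml(raw_html):
    # strip anything that looks like a markup tag, leaving the inner text
    return re.sub(r'<.*?>', '', str(raw_html)).strip()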
def scrape(): # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN" rLow = int(g['LOOP_RNDM_SLEEP_LOW']) rHigh = int(g['LOOP_RNDM_SLEEP_HIGH']) rndm_sleep = random.randint(rLow,rHigh) # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS'])) retention_date_id = retention_date.strftime('%Y%m%d') # ============================================================================= # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] retention_date_id, #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'] #[4] ) dbmgr.query(q) # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'] + '/browse' passedHTML = pyHTMLPass.htmlPass(url,**g) soup = BeautifulSoup(passedHTML, "html.parser") #print(soup) # ========================================================================================================================================================== # SCRAPE PART - START # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE DB statements # ========================================================================================================================================================== # PASS 1 - INDUSTRY DETAILS =================================================================== rndm_sleep = random.randint(rLow,rHigh) facet_type = 'INDUSTRY' link_txt_array = [] for href in soup.find_all('a'): if '/BROWSE/' in str(href).upper() and '-JOBS' in str(href).upper(): # and 'LINK-DEFAULT' not in str(href).upper(): full_ref = str(href) link_txt = str(href.get('href')) if link_txt.count('/') < 5: link_txt_array.append(link_txt.replace('/browse','')) #print(link_txt_array) for link_txt in link_txt_array: # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = link_txt passedHTML = pyHTMLPass.htmlPass(url,**g) soup = BeautifulSoup(passedHTML, "html.parser") #print(soup) for h1 in soup.find_all('h1'): facet_desc = str(h1.text).upper().replace('BROWSE','').replace('IN AUSTRALIA','').strip() for span in soup.find_all('span', class_='c'): facet_count = str(span.text) facet_count = int(facet_count.replace(',','')) # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) 
dbmgr.query(q) # ============================================================================= # WRITE HTML PAGE TO FILE # ============================================================================= if g['WRITE_HTML_TO_FILE'] == 'Y': file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(' ','_').replace('/','-') + '_' + facet_desc.replace(' ','_').replace('/','-') + '.html' with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,'w+', encoding='utf-8') as f: f.writelines(str(soup)) f.close() # PASS 2 - REGIONAL DETAILS =================================================================== facet_type = 'REGION' regions = g['REGIONS'] regions_array = [] # CONVERTS LIST OBJECT TO ARRAY FOR LOOPING for item in regions.split(','): # COMMA, OR OTHER regions_array.append(item) # LOOP THROUGH ALL THE ITEM IN REGIONS for region in regions_array: time.sleep(rndm_sleep) # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'] + g['URL_PART1'] + '{}'.format(region.replace(' ','+')) passedHTML = pyHTMLPass.htmlPass(url,**g) soup = BeautifulSoup(passedHTML, "html.parser") #print(soup) soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS soup = str(soup) # ========================================================================================================================================================== # SCRAPE SUB PART - START # ========================================================================================================================================================== facet_desc = str(region.upper()) facet_count = re.search('1-10 of(.*?)</p>', soup).group(1) facet_count = facet_count.replace(',','').strip() # ============================================================================= # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) dbmgr.query(q) # ============================================================================= # WRITE HTML PAGE TO FILE # ============================================================================= if g['WRITE_HTML_TO_FILE'] == 'Y': file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(' ','_').replace('/','-') + '_' + facet_desc.replace(' ','_').replace('/','-') + '.html' with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,'w+', encoding='utf-8') as f: f.writelines(str(passedHTML)) f.close() else: None # ========================================================================================================================================================== # SCRAPE SUB PART - END # ========================================================================================================================================================== # 
========================================================================================================================================================== # SCRAPE PART - END # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE db statements # ========================================================================================================================================================== # ============================================================================= # UPDATE LOCAL DB WITH A FINISH TIME # ============================================================================= finished_at = time.strftime("%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db dbmgr = pyDB(g['DB']) q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format( g['TBL_NME'], #[0] finished_at, #[1] g['CNTRY_CDE'], #[2] g['MSMT_DTE_ID'] #[3] ) dbmgr.query(q)
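# =============================================================================
# ILLUSTRATIVE SKETCH (hypothetical helper): the WRITE HTML PAGE TO FILE blocks
# above, with pathlib building the path instead of hand-joined '\\' strings.
# Note the explicit f.close() calls above are redundant inside a `with` block -
# the context manager already closes the file.
# =============================================================================
from pathlib import Path

def write_html_snapshot(db_dir, file_name, soup):
    out = Path(db_dir) / '__html' / file_name
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(str(soup), encoding='utf-8')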
def email_status(step): if step == 'START': # SUBJECT & RECIPIENTS mymail = pyMail( g['PKG_NME_PRNT'] + ' : NZ - STARTED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g) # START HTML BODY (GREETING / OPENING LINE OF EMAIL) mymail.htmladd('End Of Message') # SEND mymail.send(**g) elif step == 'END': # ============================================================================= # LOOPS THROUGH TABLE LIST AND GENERATES SUMMARY DATA FOR EMAIL # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""SELECT msmt_dte_id, cntry_cde, count( * ) AS row_cnt, sum(facet_cnt) as job_count FROM {0} WHERE cntry_cde = 'NZ' GROUP BY msmt_dte_id, cntry_cde ORDER BY msmt_dte_id DESC LIMIT 5""".format( 'WEBDATA_JOBADS') rslt = dbmgr.query(q) # ============================================================================= # EMAIL SUMMARY OF RESULTS TO DISTRIBUTION LIST # ============================================================================= htmlRes = '''<table cellpadding="8" cellspacing="3" border="1"> <tr> <th>msmt_date_id</th> <th>cntry_cde</th> <th>row_cnt</th> <th>job_cnt</th> </tr>''' for r in rslt: htmlRes = htmlRes + '<tr><td>' + str(r[0]) + '</td><td>' + str( r[1]) + '</td><td>' + str(r[2]) + '</td><td>' + str( r[3]) + '</td></tr>' htmlRes = htmlRes + '</table>' # ============================================================================= # LOOPS THROUGH TABLE LIST AND GENERATES SECONDARY SUMMARY DATA FOR EMAIL # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""select max(MSMT_DTE_ID) as msmt_dte_id, CNTRY_CDE, SITE_CDE, SUM( CURR_ROW_CNT ) AS CURR_ROW_CNT, SUM( PREV_ROW_CNT ) AS PREV_ROW_CNT, SUM( CURR_FACET_CNT ) AS CURR_FACET_CNT, SUM( PREV_FACET_CNT ) AS PREV_FACET_CNT from ( select msmt_dte_id, cntry_cde, site_cde, case when MSMT_DTE_ID = strftime( '%Y%m%d', date( 'now', 'localtime' ) ) then count(*) else 0 end as CURR_ROW_CNT, case when MSMT_DTE_ID = strftime( '%Y%m%d', date( 'now', 'localtime', '-1 day' ) ) then count(*) else 0 end as PREV_ROW_CNT, cast(case when MSMT_DTE_ID = strftime( '%Y%m%d', date( 'now', 'localtime' ) ) then sum( FACET_CNT ) else 0 end as INTEGER) as CURR_FACET_CNT, cast(case when MSMT_DTE_ID = strftime( '%Y%m%d', date( 'now', 'localtime', '-1 day' ) ) then sum( FACET_CNT ) else 0 end as INTEGER) as PREV_FACET_CNT from WEBDATA_JOBADS where 1 = 1 and cntry_cde = 'NZ' and MSMT_DTE_ID >= strftime( '%Y%m%d', date( 'now', 'localtime', '-1 day' ) ) group by msmt_dte_id, cntry_cde, site_cde ) group BY CNTRY_CDE, SITE_CDE order by 1, 3""".format('WEBDATA_JOBADS') rslt = dbmgr.query(q) # ============================================================================= # EMAIL SUMMARY OF RESULTS TO DISTRIBUTION LIST # ============================================================================= htmlRes2 = '''<table cellpadding="8" cellspacing="3" border="1"> <tr> <th>msmt_date_id</th> <th>cntry_cde</th> <th>site_cde</th> <th>curr_row_cnt</th> <th>prev_row_cnt</th> <th>curr_facet_cnt</th> <th>prev_facet_cnt</th> </tr>''' for r in rslt: htmlRes2 = htmlRes2 + '<tr><td>' + str(r[0]) + '</td><td>' + str( r[1]) + '</td><td>' + str(r[2]) + '</td><td>' + str( r[3]) + '</td><td>' + str(r[4]) + '</td><td>' + str( r[5]) + '</td><td>' + str(r[6]) + '</td></tr>' htmlRes2 = htmlRes2 + '</table>' # SUBJECT & RECIPIENTS mymail = pyMail( g['PKG_NME_PRNT'] + ' : NZ - ENDED @ ' + time.strftime("%Y-%m-%d %H:%M:%S"), **g) # START HTML BODY (GREETING / OPENING LINE OF EMAIL) 
mymail.htmladd('Scrape has completed for : ' + g['PKG_NME_PRNT'] + ' : NZ') # FURTHER DETAILS ADDED TO BODY (SEPARATED BY A PARAGRAPH SO LINE FEEDS NOT REQUIRED) # ADD LINE OF TEXT mymail.htmladd('Summary of Scrape for ' + g['PKG_NME_PRNT'] + ' : NZ') # ADD HTML TABLE CONSTRUCTED ABOVE mymail.htmladd(htmlRes) # ADD LINE OF TEXT mymail.htmladd('CURR and PREV days comparison') # ADD HTML TABLE CONSTRUCTED ABOVE mymail.htmladd(htmlRes2) # SEND mymail.send(**g)
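# =============================================================================
# ILLUSTRATIVE SKETCH (hypothetical helper): the summary tables in
# email_status() above, assembled with join() instead of the long chained '+'
# concatenations; headers and rows come straight from the dbmgr.query() result.
# =============================================================================
def html_table(headers, rows):
    head = ''.join('<th>{0}</th>'.format(h) for h in headers)
    body = ''.join(
        '<tr>' + ''.join('<td>{0}</td>'.format(c) for c in r) + '</tr>'
        for r in rows)
    return ('<table cellpadding="8" cellspacing="3" border="1">'
            '<tr>{0}</tr>{1}</table>').format(head, body)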
def scrape(): # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN" rLow = int(g['LOOP_RNDM_SLEEP_LOW']) rHigh = int(g['LOOP_RNDM_SLEEP_HIGH']) rndm_sleep = random.randint(rLow, rHigh) # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE retention_date = datetime.date.today() + datetime.timedelta( -int(g['DATA_RETENTION_DAYS'])) retention_date_id = retention_date.strftime('%Y%m%d') # ============================================================================= # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] retention_date_id, #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'] #[4] ) dbmgr.query(q) # ============================================================================= # PASS URL TO RETURN HTML FROM SITE PAGE # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB # ============================================================================= url = g['URL'] passedHTML = pyHTMLPass.htmlPass(url, **g) soup = BeautifulSoup(passedHTML, "html.parser") #print(soup) # ========================================================================================================================================================== # SCRAPE PART - START # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE DB statements # ========================================================================================================================================================== # PASS 1 - INDUSTRY AND REGION COUNTS ========================================================= for div in soup.find_all('div', class_='srp-filter-panel--inner'): for links in soup.find_all('a'): full_ref = str(links) link_txt = str(links.get('href')) match_pattern = re.search(r'\((.*?)\)', full_ref) if 'SRP-LIST-FILTER__ITEM' in str(links).upper( ) and match_pattern is not None and '/S-JOBS/C9302' not in str( links).upper(): #and 'AD=OFFERING' in str(links).upper() if '/S-JOBS/ACT' in str(links).upper() or '/S-JOBS/NSW' in str( links).upper() or '/S-JOBS/NT' in str(links).upper( ) or '/S-JOBS/QLD' in str( links).upper() or '/S-JOBS/SA' in str(links).upper( ) or '/S-JOBS/TAS' in str(links).upper( ) or '/S-JOBS/VIC' in str(links).upper( ) or '/S-JOBS/WA' in str(links).upper(): facet_type = 'REGION' elif '/S-JOBS/JOBTYPE' in str(links).upper(): facet_type = 'JOBTYPE' elif '/S-JOBS/ADVERTISEDBY' in str(links).upper(): facet_type = 'ADVERTISED' else: facet_type = 'INDUSTRY' #<a class="srp-list-filter__item-link link link--no-underline" href="/s-trades-services/c22340?ad=offering">Trades & Services (7,955)</a> try: objText = re.search(r'">(.*?)</a>', str(full_ref)).group(1) facet_desc = objText.upper().replace('&AMP;', '&') facet_desc = re.sub(r'\((.*?)\)', "", facet_desc) facet_desc = re.sub('[^A-Za-z0-9&' ' ]', '', facet_desc) facet_desc = facet_desc.strip() facet_count = re.search(r'\((.*?)\)', str(links)).group(1) facet_count = int(facet_count.replace(',', '')) # ============================================================================= # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB # ============================================================================= dbmgr = pyDB(g['DB']) q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, 
FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format( g['TBL_NME'], #[0] g['MSMT_DTE_ID'], #[1] g['DATA_TYPE'], #[2] g['CNTRY_CDE'], #[3] g['SITE_CDE'], #[4] facet_type, #[5] facet_desc, #[6] facet_count, #[7] g['STARTED_AT'], #[8] '' #[9] ) #print(q) dbmgr.query(q) except ValueError: pass # it was a string, not an int. # ============================================================================= # WRITE HTML PAGE TO FILE # ============================================================================= if g['WRITE_HTML_TO_FILE'] == 'Y': file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[ 'SITE_CDE'] + '_' + 'SITE_LISTING' + '.html' with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f: f.writelines(str(soup)) f.close() # ========================================================================================================================================================== # SCRAPE PART - END # - this should be the primary section of code that changes # - only other sections that "may" change are DELETE and UPDATE db statements # ========================================================================================================================================================== # ============================================================================= # UPDATE LOCAL DB WITH A FINISH TIME # ============================================================================= finished_at = time.strftime( "%Y-%m-%d %H:%M:%S") # capture a finish time to be entered into the db dbmgr = pyDB(g['DB']) q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format( g['TBL_NME'], #[0] finished_at, #[1] g['CNTRY_CDE'], #[2] g['MSMT_DTE_ID'] #[3] ) dbmgr.query(q)
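# -----------------------------------------------------------------------------
# NOTE: the INSERT above interpolates scraped text into SQL with str.format,
# which breaks on values containing quotes (and is injection-prone). If pyDB
# exposes (or is backed by) a sqlite3 connection, a parameterised sketch like
# this would be safer; the `conn` handle and helper name are assumptions, not
# the existing pyDB API.
# -----------------------------------------------------------------------------
import sqlite3

def insert_facet(conn, tbl_nme, row):
    # table names cannot be bound as parameters, so the name is still
    # formatted in; all *values* go through ? placeholders instead
    q = ("INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, "
         "FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) "
         "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)").format(tbl_nme)
    conn.execute(q, row)
    conn.commit()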
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
            and cntry_cde = '{3}'
            and site_cde = '{4}'""".format(
        g['TBL_NME'],      #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id, #[2]
        g['CNTRY_CDE'],    #[3]
        g['SITE_CDE']      #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================

    # PASS 1 - TOTAL COUNT ========================================================================
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'
    nbr = re.search('>(.*?)jobs</span>', str(soup)).group(1)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],     #[0]
        g['MSMT_DTE_ID'], #[1]
        g['DATA_TYPE'],   #[2]
        g['CNTRY_CDE'],   #[3]
        g['SITE_CDE'],    #[4]
        facet_type,       #[5]
        facet_desc,       #[6]
        facet_count,      #[7]
        g['STARTED_AT'],  #[8]
        ''                #[9]
    )
    dbmgr.query(q)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[
            'SITE_CDE'] + '_' + facet_type.replace(
                ' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # PASS 2 - INDUSTRY COUNT =====================================================================
    for links in soup.find_all('a'):
        link_txt = str(links.get('href'))
        link_nbr = re.findall(r'\d+', link_txt)
        link_nbr_ = ''.join(str(e) for e in link_nbr)
        nbr_chk = int(link_nbr_) if link_nbr_ else 0
        if 'JOBS-IN-' in link_txt.upper():
            facet_type = 'REGION'
        else:
            facet_type = 'INDUSTRY'
        # FINAL ASSIGNMENTS
        facet_desc = str(links.string).upper()
        facet_desc = re.sub(r"[!@#$']", '',
                            facet_desc)  # removes special characters from string
        if 'JOBS-' in link_txt.upper() and nbr_chk <= int(
                g['REGION_CHK_ID']):  # if href matches what is considered relevant, do the following
            time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE
            # =============================================================================
            # PASS URL TO RETURN HTML FROM SITE PAGE
            # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
            # =============================================================================
            url = g['URL'] + link_txt
            passedHTML = pyHTMLPass.htmlPass(url, **g)
            soup = BeautifulSoup(passedHTML, "html.parser")
            #print(soup)
            #soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS
            #soup = str(soup)            # may need to repass text back through the beautifulsoup interpreter
            try:
                nbr = re.search(r'1 to(.*?)jobs',
                                str(soup.encode("utf-8"))).group(1)
                # keep only the trailing total (e.g. " 25 of 1,234 " -> 1234)
                facet_count = int(
                    nbr.strip().rsplit(' ', 1)[-1].replace(',', ''))
            except Exception:
                facet_count = 0
            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                    VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],     #[0]
                g['MSMT_DTE_ID'], #[1]
                g['DATA_TYPE'],   #[2]
                g['CNTRY_CDE'],   #[3]
                g['SITE_CDE'],    #[4]
                facet_type,       #[5]
                facet_desc,       #[6]
                facet_count,      #[7]
                g['STARTED_AT'],  #[8]
                ''                #[9]
            )
            dbmgr.query(q)
            # =============================================================================
            # WRITE HTML PAGE TO FILE
            # =============================================================================
            if g['WRITE_HTML_TO_FILE'] == 'Y':
                file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[
                    'SITE_CDE'] + '_' + facet_type.replace(
                        ' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
                with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                          'w+', encoding='utf-8') as f:
                    f.writelines(str(soup))

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
            and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
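# -----------------------------------------------------------------------------
# NOTE: the '1 to(.*?)jobs' pattern above depends on the page wording
# "1 to 25 of 1,234 jobs". A standalone check of that extraction, using a
# hypothetical sample string, looks like this:
# -----------------------------------------------------------------------------
import re

sample = 'Showing 1 to 25 of 1,234 jobs'
m = re.search(r'1 to(.*?)jobs', sample)
if m:
    # keep only the final token ("1,234") and strip the thousands separator
    count = int(m.group(1).strip().rsplit(' ', 1)[-1].replace(',', ''))
    assert count == 1234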
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
            and cntry_cde = '{3}'
            and site_cde = '{4}'""".format(
        g['TBL_NME'],      #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id, #[2]
        g['CNTRY_CDE'],    #[3]
        g['SITE_CDE']      #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================

    # PASS 1 - TOTAL COUNT ========================================================================
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'
    nbr = re.search('<title>(.*?)</title>', str(soup.encode("utf-8"))).group(1)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],     #[0]
        g['MSMT_DTE_ID'], #[1]
        g['DATA_TYPE'],   #[2]
        g['CNTRY_CDE'],   #[3]
        g['SITE_CDE'],    #[4]
        facet_type,       #[5]
        facet_desc,       #[6]
        facet_count,      #[7]
        g['STARTED_AT'],  #[8]
        ''                #[9]
    )
    dbmgr.query(q)

    # PASS 2 - INDUSTRY COUNT =====================================================================
    for ul in soup.find_all('ul', class_='facet'):
        for li in ul.find_all('li'):
            # return the facet text (section title)
            facet = li.find(
                'strong')  # assumes the first row of the facet is the "title" row - breaks if it isn't
            if facet:
                facet_type = facet.text.upper()
            else:
                facet_type = facet_type.upper()  # if None is found, carry the current facet_type forward
            facet_desc = li.find('a')
            if facet_desc:  # checks there is a result for the "a" anchor (the section-title rows, handled above, have none)
                try:
                    facet_desc = facet_desc.text.upper()
                    facet_desc = re.sub(
                        r"[!@#$']", '',
                        str(facet_desc))  # removes special characters from string
                    facet_count = li.find('span')
                    facet_count = int(facet_count.text.replace(',', ''))
                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'],     #[0]
                        g['MSMT_DTE_ID'], #[1]
                        g['DATA_TYPE'],   #[2]
                        g['CNTRY_CDE'],   #[3]
                        g['SITE_CDE'],    #[4]
                        facet_type,       #[5]
                        facet_desc,       #[6]
                        facet_count,      #[7]
                        g['STARTED_AT'],  #[8]
                        ''                #[9]
                    )
                    dbmgr.query(q)
                except Exception:
                    pass
            else:
                pass  # no "a" anchor found - ignore

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[
            'SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # PASS 3 - REGION COUNT =====================================================================
    facet_type = 'REGION'
    regions = g['REGIONS']
    regions_array = regions.split(',')  # COMMA-SEPARATED LIST TO ARRAY FOR LOOPING

    # LOOP THROUGH ALL THE ITEMS IN REGIONS
    for region in regions_array:
        time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE
        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        # rebuilt from the base url each pass so region parts do not accumulate
        region_url = url + g['URL_PART2'] + region.replace(' ', '+')
        passedHTML = pyHTMLPass.htmlPass(region_url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)
        facet_desc = str(region.upper())
        facet_count = re.search(r'10</span> of <span>(.*?)</span>',
                                str(soup.encode("utf-8"))).group(1)
        facet_count = int(facet_count.replace(',', ''))
        # =============================================================================
        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
        # =============================================================================
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],     #[0]
            g['MSMT_DTE_ID'], #[1]
            g['DATA_TYPE'],   #[2]
            g['CNTRY_CDE'],   #[3]
            g['SITE_CDE'],    #[4]
            facet_type,       #[5]
            facet_desc,       #[6]
            facet_count,      #[7]
            g['STARTED_AT'],  #[8]
            ''                #[9]
        )
        dbmgr.query(q)
        # =============================================================================
        # WRITE HTML PAGE TO FILE
        # =============================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[
                'SITE_CDE'] + '_' + facet_type.replace(' ', '_').replace(
                    '/', '-') + '_' + facet_desc.replace(' ', '_').replace(
                        '/', '-') + '.html'
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                      'w+', encoding='utf-8') as f:
                f.writelines(str(soup))

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
            and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
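# -----------------------------------------------------------------------------
# NOTE: the region loop above URL-encodes spaces with a manual
# replace(' ', '+'). urllib.parse.quote_plus is an alternative that also
# handles other reserved characters; a small sketch with a hypothetical base
# URL (not the site actually scraped here):
# -----------------------------------------------------------------------------
from urllib.parse import quote_plus

base = 'https://example.com/jobs?region='  # hypothetical base URL
for region in ['NEW SOUTH WALES', 'WESTERN AUSTRALIA']:
    print(base + quote_plus(region))       # spaces become '+'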
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
            and cntry_cde = '{3}'
            and site_cde = '{4}'""".format(
        g['TBL_NME'],      #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id, #[2]
        g['CNTRY_CDE'],    #[3]
        g['SITE_CDE']      #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================

    # PASS 1 - INDUSTRY & REGION ==========================================================
    for div in soup.find_all('div', id="centre_col"):
        for a in div.find_all('a'):
            if '/IN/' in str(a).upper():
                facet_type = 'REGION'
            else:
                facet_type = 'INDUSTRY'
            dest_url1 = str(a.get('href'))
            time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE
            # =============================================================================
            # PASS URL TO RETURN HTML FROM SITE PAGE
            # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
            # =============================================================================
            url = g['URL'] + dest_url1
            passedHTML = pyHTMLPass.htmlPass(url, **g)
            soup = BeautifulSoup(passedHTML, "html.parser")
            #soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS
            #soup = str(soup)            # may need to repass text back through the beautifulsoup interpreter
            for h1 in soup.find_all('h1'):
                for a in h1.find_all('a'):
                    dest_url2 = str(a.get('href'))
                    txt = a.text.upper()
                    facet_desc = txt.replace("'", '').replace('JOBS', '').strip()
                    # =============================================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================================
                    url = g['URL'] + dest_url2
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    #soup = soup.encode("utf-8") # CODE PAGE ERROR - CONVERTS
                    #soup = str(soup)            # may need to repass text back through the beautifulsoup interpreter
                    nbr = re.search(
                        r'</SPAN> OF <SPAN>(.*?)</SPAN>',
                        str(soup).encode("utf-8", "ignore").decode(
                            'ascii', 'ignore').upper()).group(1)
                    nbr = str(nbr).replace(',', '')
                    facet_count = int(nbr)
                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'],     #[0]
                        g['MSMT_DTE_ID'], #[1]
                        g['DATA_TYPE'],   #[2]
                        g['CNTRY_CDE'],   #[3]
                        g['SITE_CDE'],    #[4]
                        facet_type,       #[5]
                        facet_desc,       #[6]
                        facet_count,      #[7]
                        g['STARTED_AT'],  #[8]
                        ''                #[9]
                    )
                    dbmgr.query(q)
                    # =============================================================================
                    # WRITE HTML PAGE TO FILE
                    # =============================================================================
                    if g['WRITE_HTML_TO_FILE'] == 'Y':
                        file_name = g['MSMT_DTE_ID'] + '_' + g[
                            'CNTRY_CDE'] + '_' + g[
                                'SITE_CDE'] + '_' + facet_type.replace(
                                    ' ', '_').replace(
                                        '/', '-') + '_' + facet_desc.replace(
                                            ' ', '_').replace('/', '-') + '.html'
                        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                                  'w+', encoding='utf-8') as f:
                            f.writelines(str(soup))

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
            and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
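# -----------------------------------------------------------------------------
# NOTE: the result-count regex above first strips non-ASCII bytes so that
# characters such as non-breaking spaces cannot split the match. The same
# normalisation in isolation, on a hypothetical HTML fragment:
# -----------------------------------------------------------------------------
import re

fragment = '<span>10</span>\xa0of <span>4,567</span> results'
clean = fragment.encode('ascii', 'ignore').decode('ascii').upper()
m = re.search(r'OF <SPAN>(.*?)</SPAN>', clean)
if m:
    assert int(m.group(1).replace(',', '')) == 4567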
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (captr_dte_id = {1} or captr_dte_id <= {2})""".format(
        g['TBL_NME'],      #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id  #[2]
    )
    dbmgr.query(q)

    # =============================================================================
    # LOOP THROUGH DATES FOR HISTORICAL SCRAPES (ONLY REQUIRED FOR FIRST RUN)
    # =============================================================================
    # dts = g['MONTH_LIST'] # ONLY NEEDED TO RUN HISTORY
    first_dy_curr_mth = fdttm.today().replace(day=1)
    dts = []
    dts.append((first_dy_curr_mth - datetime.timedelta(days=1)
                ).strftime('%b.%Y').lower())  # PREVIOUS MONTH
    dts.append(time.strftime('%b.%Y').lower())  # CURRENT MONTH
    dts.append((datetime.date.today() + relativedelta.relativedelta(months=1)
                ).strftime('%b.%Y').lower())  # NEXT MONTH X1
    dts.append((datetime.date.today() + relativedelta.relativedelta(months=2)
                ).strftime('%b.%Y').lower())  # NEXT MONTH X2
    #dts.append((datetime.date.today() + relativedelta.relativedelta(months=3)).strftime('%b.%Y').lower())  # NEXT MONTH X3 -- CALENDAR DOESN'T GO THIS FAR FORWARD

    for item in dts:
        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'] + item
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        # ==========================================================================================================================================================
        # SCRAPE PART - START
        # - this should be the primary section of code that changes
        # - only other sections that "may" change are DELETE and UPDATE DB statements
        # ==========================================================================================================================================================

        # GET MONTH AND YEAR FROM HEADER
        for div in soup.find_all('div', class_='head'):
            for span in div.find_all('span'):
                if '<strong>' in str(span).lower():
                    dt_part = span.text.upper().split(' ')
                    annce_mth = dt_part[-2]
                    annce_yr = dt_part[-1]

        for tab in soup.find_all('table', class_='calendar__table'):
            for row in tab.find_all('tr'):
                row_str = str(row).lower()
                if ('calendar__row' in row_str
                        and 'day-breaker' not in row_str
                        and 'calendarexpanded__container' not in row_str):
                    cell_nbr = 1  # INITIALISE CELL NBR TO 1 - DATA IS ASSIGNED BY CELL NBR (POSITION), WHICH SHOULDN'T CHANGE
                    for cell in row.find_all('td'):
                        #print(cell)
                        if cell_nbr == 1:  # DATE OF MONTH (IF NOT NULL)
                            try:
                                dt = re.search('<span>(.*)</span>', str(cell).lower())
                                dt = dt.group(1).replace('</span>', '').upper()
                                dt_part = dt.split(' ')
                                mth_nme = dt_part[0]
                                dy_nbr = dt_part[1]
                                if len(dy_nbr) == 1:
                                    dy_nbr = '0' + str(dy_nbr)
                                else:
                                    dy_nbr = str(dy_nbr)
                                mth_nbr = g['MONTH_NBR_CNVRT'].get(mth_nme)
                                annce_dt = str(annce_yr) + '-' + str(mth_nbr) + '-' + str(dy_nbr)
                                msmt_dte_id = str(annce_yr) + str(mth_nbr) + str(dy_nbr)
                            except Exception:
                                annce_dt = annce_dt  # no date cell - carry the previous row's date forward
                        elif cell_nbr == 2:  # TIME OF DAY (MIGHT BE "ALL DAY" EVENT)
                            if cell.text.strip().upper() != '':
                                annce_tm = cell.text.strip().upper()
                            else:
                                try:
                                    annce_tm = annce_tm  # carry the previous row's time forward
                                except Exception:
                                    annce_tm = ''
                        elif cell_nbr == 3:  # CNTRY CDE
                            try:
                                cntry_cde = cell.text.strip().upper()
                            except Exception:
                                cntry_cde = ''
                        elif cell_nbr == 4:  # IMPACT (LOW / MEDIUM / HIGH)
                            result = cell.find('span')
                            if result is not None:
                                impact = result.get('title')
                                impact = impact.upper().replace('IMPACT EXPECTED', '').strip()
                            else:
                                impact = ''
                        elif cell_nbr == 5:  # EVENT DESCRIPTION
                            try:
                                for span in cell.find_all('span'):
                                    event_desc = span.text.strip().upper()
                            except Exception:
                                event_desc = ''
                        elif cell_nbr == 6:  # -- IGNORE -- LINK TO DETAILS
                            pass
                        elif cell_nbr == 7:  # ACTUAL VALUE
                            try:
                                actual_val = cell.text.strip()
                            except Exception:
                                actual_val = ''
                        elif cell_nbr == 8:  # FORECAST VALUE
                            try:
                                forecast_val = cell.text.strip()
                            except Exception:
                                forecast_val = ''
                        elif cell_nbr == 9:  # PREVIOUS VALUE
                            try:
                                previous_val = cell.text.strip()
                            except Exception:
                                previous_val = ''
                        elif cell_nbr == 10:  # -- IGNORE -- LINK TO GRAPH
                            pass
                        else:
                            continue
                        cell_nbr += 1

                    # GENERATE A CODE FROM THE DESC AND CRNCY
                    annce_cde = pyLIB.codeGen(cntry_cde + ' ' + event_desc)  # GET CODE

                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, SITE_CDE, ANNCE_DTE, ANNCE_TM, CNTRY_CDE, ANNCE_CDE, ANNCE_DESC, IMPACT, ACTUAL, FORECAST, PREVIOUS, CAPTR_DTE_ID, STARTED_AT, FINISHED_AT)
                            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', "{8}", '{9}', '{10}', '{11}', '{12}', {13}, '{14}', '{15}')""".format(
                        g['TBL_NME'],      #[0]
                        msmt_dte_id,       #[1]
                        g['DATA_TYPE'],    #[2]
                        g['SITE_CDE'],     #[3]
                        annce_dt,          #[4]
                        annce_tm,          #[5]
                        cntry_cde,         #[6]
                        annce_cde,         #[7]
                        event_desc,        #[8] double-quoted: descriptions can contain apostrophes
                        impact,            #[9]
                        actual_val,        #[10]
                        forecast_val,      #[11]
                        previous_val,      #[12]
                        g['MSMT_DTE_ID'],  #[13]
                        g['STARTED_AT'],   #[14]
                        ''                 #[15]
                    )
                    #print(q)
                    dbmgr.query(q)

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE captr_dte_id = {2}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['MSMT_DTE_ID']  #[2]
    )
    dbmgr.query(q)
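# -----------------------------------------------------------------------------
# NOTE: the calendar rows above stitch msmt_dte_id together from a month-name
# lookup (g['MONTH_NBR_CNVRT']) plus a zero-padded day. A self-contained sketch
# of that construction, with a hypothetical (truncated) lookup table:
# -----------------------------------------------------------------------------
MONTH_NBR_CNVRT = {'JAN': '01', 'FEB': '02', 'MAR': '03'}  # truncated example

def build_dte_id(yr, mth_nme, dy_nbr):
    # zero-pad single-digit days so ids sort correctly as strings
    dy = str(dy_nbr).zfill(2)
    return str(yr) + MONTH_NBR_CNVRT[mth_nme] + dy

assert build_dte_id(2018, 'FEB', 5) == '20180205'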