def getAdditionalActivityDetails(activityDictionary):
    """Fetch an activity's detail page and merge its fields into the dict.

    Expects activityDictionary["activity_code"] to identify the activity.
    On success, adds a cleaned "title" key plus one key per detail section
    found on the page, then returns the enriched dictionary. If the site
    responds with a non-200 status, records the outage via
    saveSiteDownStatus(True) and returns None.
    """
    detailPageUrl = (
        "https://one.pa.gov.sg/CRMSPortal/CRMSPortal.portal?_nfpb=true&_st=&_windowLabel=CRMSPortal_1&_urlType=render&_mode=view&wlpCRMSPortal_1_action=ACMParticipantMaintain&_pageLabel=CRMSPortal_page_1&IdProdInst="
        + activityDictionary["activity_code"])

    pageResponse = requests.get(detailPageUrl)

    # Bail out (and flag the outage) if the operator site is down.
    if pageResponse.status_code != 200:
        print("Operator site is unavailable")
        print(" Step: Get additional activity details")
        print(" URL: " + detailPageUrl)
        saveSiteDownStatus(True)
        return

    detailSoup = BeautifulSoup(pageResponse.text, "html.parser")

    # The title cell needs its own cleanup (tabs / CRLF stripped).
    titleCell = detailSoup.find("td", {'class': 'course_title'})
    activityDictionary["title"] = (titleCell.get_text()
                                   .replace("\t", "")
                                   .replace("\r\n", "")
                                   .strip())

    # Each <h2 class="title_main2"> heading labels the <p> that follows it;
    # collapse runs of whitespace and drop the trailing colon from headings.
    # (Whitespace-collapse idiom: https://stackoverflow.com/questions/1546226)
    for heading in detailSoup.find_all("h2", {'class': 'title_main2'}):
        headingText = " ".join(heading.get_text().split()).replace(":", "")
        bodyText = " ".join(heading.find_next("p").get_text().split())
        activityDictionary[headingText] = bodyText.strip()

    return activityDictionary
def getAllCourseListing(url, operator):
    """Page through the operator's course-listing endpoint and collect courses.

    Posts a session request to obtain cookies, then repeatedly POSTs the
    paginated listing endpoint at `url` until a page yields no new entries.
    Returns the de-duplicated list of course elements (BeautifulSoup tags),
    or None after recording the outage via saveSiteDownStatus(True) if the
    site is unavailable.

    FIX: the cookie-response availability check was inside the pagination
    loop even though cookieResponseObj never changes between iterations;
    it is hoisted out. Behavior is identical (a failing check would have
    returned on the first iteration anyway).
    """
    cookieResponseObj = requests.post("https://www.onepa.sg/cat/kayaking")

    # Loop-invariant availability check, performed once up front.
    if (cookieResponseObj.status_code != 200
            or "unavailable" in cookieResponseObj.text):
        print("Operator site is unavailable")
        print(" Step: Get cookie from site")
        print(" Operator: " + operator)
        print(" URL: " + url)
        saveSiteDownStatus(True)
        return

    page = 1
    outputListing = []
    newIterationListingCount = 1  # primed so the loop runs at least once

    # Keep fetching pages until a page contributes no new listings.
    while newIterationListingCount > 0:
        newIterationListingCount = 0

        payload = {
            "cat": "kayaking",
            "subcat": "",
            "sort": "",
            "filter": "[filter]",
            "cp": page
        }
        headers_custom = {"Content-Type": "application/json; charset=UTF-8"}

        # Get Course Listing based on URL argument
        urlResponseObj = requests.post(url,
                                       data=json.dumps(payload),
                                       headers=headers_custom,
                                       cookies=cookieResponseObj.cookies)

        # Check if site is unavailable
        if urlResponseObj.status_code != 200:
            print("Operator site is unavailable")
            print(" Step: Get a list of courses")
            print(" Operator: " + operator)
            print(" URL: " + url)
            saveSiteDownStatus(True)
            return

        # The HTML fragment of listings lives under the JSON key "d".
        urlResponseJson = json.loads(urlResponseObj.text)
        soupHtml = BeautifulSoup(urlResponseJson["d"], "html.parser")

        # Collect listings not already seen; count new ones so the loop
        # terminates once a page repeats earlier results.
        for item in soupHtml.find_all("table1"):
            if item not in outputListing:
                outputListing.append(item)
                newIterationListingCount = newIterationListingCount + 1

        page = page + 1

    return outputListing
def getAdditionalCourseDetails(courseDictionary):
    """Fetch a course's detail page and fold its sections into the dict.

    Uses courseDictionary["course_code"] (minus its leading 'C') to build
    the detail-page URL. On success, adds one key per detail section and
    returns processAdditionalCourseDetails(courseDictionary). If the site
    responds with a non-200 status, records the outage via
    saveSiteDownStatus(True) and returns None.
    """
    # Slice off the leading 'C' of the course_code for the URL parameter.
    instanceId = courseDictionary["course_code"][1:]
    coursePageUrl = (
        "https://one.pa.gov.sg/CRMSPortal/CRMSPortal.portal?_nfpb=true&_st=&_windowLabel=CRMSPortal_1&_urlType=render&_mode=view&wlpCRMSPortal_1_action=ACMParticipantMaintain&_pageLabel=CRMSPortal_page_1&IdProdInst="
        + instanceId)

    pageResponse = requests.get(coursePageUrl)

    # Abort (and flag the outage) when the operator site is down.
    if pageResponse.status_code != 200:
        print("Operator site is unavailable")
        print(" Step: Get more information on a specific course")
        print(" URL: " + coursePageUrl)
        saveSiteDownStatus(True)
        return

    pageSoup = BeautifulSoup(pageResponse.text, "html.parser")

    # Each <h2 class="title_main2"> heading labels the <p> that follows;
    # normalize whitespace and drop the trailing colon from headings.
    # (Whitespace-collapse idiom: https://stackoverflow.com/questions/1546226)
    for heading in pageSoup.find_all("h2", {'class': 'title_main2'}):
        sectionName = " ".join(heading.get_text().split()).replace(":", "")
        sectionText = " ".join(heading.find_next("p").get_text().split())
        courseDictionary[sectionName] = sectionText.strip()

    return processAdditionalCourseDetails(courseDictionary)
def getOutletActivityListing(url, outletId, operator):
    """List activities (code + vacancies) offered at one outlet.

    POSTs the outlet form at `url` with `outletId`, then scrapes the
    response for activity codes and their vacancy counts. Returns a list
    of dicts with keys "activity_code" and "vacancies". If the site
    responds with a non-200 status, records the outage via
    saveSiteDownStatus(True) and returns None.
    """
    requestHeaders = {'User-Agent': 'Mozilla/5.0'}
    formData = {
        'idInternalCCRC': outletId,
        "cdSubj": "0",
        "btnGo.x": "42",
        "btnGo.y": "0"
    }

    # Make POST request to get actvities for the club specified in outletId
    outletResponse = requests.post(url, data=formData, headers=requestHeaders)

    # Abort (and flag the outage) when the operator site is down.
    if outletResponse.status_code != 200:
        print("Operator site is unavailable")
        print(" Step: Get a list of activities by outlet")
        print(" Operator: " + operator)
        print(" URL: " + url)
        saveSiteDownStatus(True)
        return

    pageText = outletResponse.text

    # Activity codes appear inside doRedirectToProducts(...) JS calls.
    codeMatches = re.finditer(
        r'javascript:doRedirectToProducts\(\'([0-9]*)\',\'[a-zA-Z]*\',\'[a-zA-Z]*\'\)',
        pageText)

    # Vacancy figures follow a "Vacancies:" label span.
    vacancyMatches = re.finditer(
        r'<SPAN class=body_boldtext>Vacancies:</SPAN>\s*(\d+|No Vacancy)',
        pageText)

    # Pair each code with its vacancy count, in document order.
    return [{
        "activity_code": codeMatch.group(1),
        "vacancies": vacancyMatch.group(1)
    } for codeMatch, vacancyMatch in zip(codeMatches, vacancyMatches)]
def getIndividualExpeditionPage(individualExpeditionUrl):
    """Fetch one expedition page and return it as a BeautifulSoup object.

    Returns an empty soup when the URL is missing/blank, or when the site
    responds with a non-200 status (in which case the outage is recorded
    via saveSiteDownStatus(True)).

    FIX: the error branch previously printed the undefined names
    `operator` and `url`, so a non-200 response raised NameError instead
    of returning the empty soup. It now prints the actual argument.
    """
    # Guard: nothing to fetch.
    if individualExpeditionUrl is None or individualExpeditionUrl == "":
        return BeautifulSoup("", "html.parser")

    # Get Expedition Listing based on URL argument
    urlResponseObj = requests.get(individualExpeditionUrl)

    # Check if site is unavailable
    if urlResponseObj.status_code != 200:
        print("Operator site is unavailable")
        print(" Step: Get a list of expeditions")
        print(" URL: " + individualExpeditionUrl)
        saveSiteDownStatus(True)
        return BeautifulSoup("", "html.parser")

    # Construct a BeautifulSoup Object
    return BeautifulSoup(urlResponseObj.text, "html.parser")
def getCourseListing(url, operator):
    """Scrape the operator's course table at `url` into a list of dicts.

    Each course row starting with a code like "C12345" begins a new dict
    under the key "course_code"; the following cells are stored under the
    remaining courseTableFields names (index 0, "Course Code", is covered
    by the "course_code" key itself). Returns the list of course dicts,
    or None after recording the outage via saveSiteDownStatus(True) if
    the site is unavailable.

    FIXES: the course-code regex is now a raw string (the '\\d' escape in
    a plain string is deprecated and a SyntaxWarning in modern Python),
    and a missing listing table no longer raises AttributeError.
    """
    # Get Course Listing based on URL argument
    urlResponseObj = requests.get(url)

    # Check if site is unavailable
    if urlResponseObj.status_code != 200:
        print("Operator site is unavailable")
        print(" Step: Get a list of courses")
        print(" Operator: " + operator)
        print(" URL: " + url)
        saveSiteDownStatus(True)
        return

    # Construct a BeautifulSoup Object
    soupHtml = BeautifulSoup(urlResponseObj.text, "html.parser")

    # Extract out the list of courses
    allCoursesRaw = soupHtml.find("table", {'class': 'sub_table'})
    if allCoursesRaw is None:
        # Page returned 200 but without the expected table — treat as
        # "no courses" rather than crashing on None.find_all.
        return []
    allCoursesTags = allCoursesRaw.find_all("td")

    # For every line that contains a course code, create a dictionary
    # object with the fields, and save it into a list.
    courseTableFields = [
        "Course Code", "Organizing", "Venue", "Date", "Time", "Fee",
        "Vacancies", "Action"
    ]
    courseCodePattern = re.compile(r'^(C\d+)')  # hoisted, raw string

    allCourseDictionary = []
    courseDictionary = {}
    isCourse = False
    courseTableIndex = 1  # index 0 is stored as "course_code" instead

    for rawLine in allCoursesTags:
        # Pre-processing of rawLine
        line = rawLine.get_text().replace("\t", "").replace("\r\n", "")
        if len(line.strip()) != 0:
            if courseCodePattern.search(line) is not None:
                # A new course row: flush the previous course (a dict with
                # keys is truthy) before starting a fresh one.
                if bool(courseDictionary):
                    allCourseDictionary.append(courseDictionary)

                # Reinitialize the variables
                courseDictionary = {}
                courseTableIndex = 1

                courseDictionary["course_code"] = line
                isCourse = True
            elif isCourse and courseTableIndex < len(courseTableFields):
                # Save the line with the right table index
                courseDictionary[courseTableFields[courseTableIndex]] = line
                courseTableIndex += 1

    # To add the last course into the allCourseDictionary list
    if bool(courseDictionary):
        allCourseDictionary.append(courseDictionary)

    return allCourseDictionary