def getAdditionalActivityDetails(activityDictionary):
    """Enrich an activity dictionary with data from its individual page.

    The page is requested using activityDictionary["activity_code"] as the
    IdProdInst query value. Two things are then written into the dictionary:

    * "title": text of the td.course_title cell with tabs and CRLF
      sequences removed and surrounding whitespace stripped.
    * one key per h2.title_main2 heading (whitespace collapsed, colons
      removed) whose value is the whitespace-collapsed text of the next
      <p> element following that heading.

    Returns the same dictionary object (mutated in place), or None when the
    page does not answer with HTTP 200; in that case saveSiteDownStatus(True)
    is called first.
    """
    detailPageUrl = "https://one.pa.gov.sg/CRMSPortal/CRMSPortal.portal?_nfpb=true&_st=&_windowLabel=CRMSPortal_1&_urlType=render&_mode=view&wlpCRMSPortal_1_action=ACMParticipantMaintain&_pageLabel=CRMSPortal_page_1&IdProdInst=" + activityDictionary["activity_code"]
    response = requests.get(detailPageUrl)

    # Anything other than HTTP 200 is treated as the operator site being down
    if response.status_code != 200:
        print("Operator site is unavailable")
        print("    Step: Get additional activity details")
        print("    URL: " + detailPageUrl)
        saveSiteDownStatus(True)
        return None

    pageSoup = BeautifulSoup(response.text, "html.parser")

    # The title cell carries layout tabs and CRLFs that must be dropped
    titleCell = pageSoup.find("td", {'class': 'course_title'})
    rawTitle = titleCell.get_text()
    cleanedTitle = rawTitle.replace("\t", "").replace("\r\n", "")
    activityDictionary["title"] = cleanedTitle.strip()

    # Each section heading becomes a key; the paragraph after it is the value
    for heading in pageSoup.find_all("h2", {'class': 'title_main2'}):
        headingWords = heading.get_text().split()
        key = " ".join(headingWords).replace(":", "")
        paragraphWords = heading.find_next("p").get_text().split()
        activityDictionary[key] = " ".join(paragraphWords).strip()

    return activityDictionary
def getAllCourseListing(url, operator):
    """Collect all course listing elements from a paginated listing endpoint.

    A preliminary POST to https://www.onepa.sg/cat/kayaking supplies the
    cookies that are sent with every listing request. The listing endpoint
    at `url` is then POSTed a JSON payload whose "cp" field is the page
    number, starting at 1. The HTML found under the "d" key of each JSON
    response is parsed and its "table1" elements are gathered. Paging stops
    as soon as a page contributes no element that was not already collected.

    Args:
        url: Listing endpoint that accepts the JSON payload.
        operator: Operator name; used only in the diagnostic output.

    Returns:
        A list of the collected elements, or None when either the cookie
        request or a listing request indicates the site is unavailable
        (saveSiteDownStatus(True) is called in that case).
    """
    cookieResponseObj = requests.post("https://www.onepa.sg/cat/kayaking")

    # The cookie response never changes while paging, so check it once up
    # front rather than on every iteration.
    if (cookieResponseObj.status_code != 200
            or "unavailable" in cookieResponseObj.text):
        print("Operator site is unavailable")
        print("    Step: Get cookie from site")
        print("    Operator: " + operator)
        print("    URL: " + url)
        saveSiteDownStatus(True)
        return

    headers_custom = {"Content-Type": "application/json; charset=UTF-8"}
    page = 1
    outputListing = []
    newIterationListingCount = 1

    # while loop added to handle pagination
    while newIterationListingCount > 0:
        newIterationListingCount = 0

        payload = {
            "cat": "kayaking",
            "subcat": "",
            "sort": "",
            "filter": "[filter]",
            "cp": page
        }

        # Get Course Listing based on URL argument
        urlResponseObj = requests.post(url,
                                       data=json.dumps(payload),
                                       headers=headers_custom,
                                       cookies=cookieResponseObj.cookies)

        # Check if site is unavailable
        if urlResponseObj.status_code != 200:
            print("Operator site is unavailable")
            print("    Step: Get a list of courses")
            print("    Operator: " + operator)
            print("    URL: " + url)
            saveSiteDownStatus(True)
            return

        # Extract the course listing in the Json key "d"
        urlResponseJson = json.loads(urlResponseObj.text)

        # Construct a BeautifulSoup Object
        soupHtml = BeautifulSoup(urlResponseJson["d"], "html.parser")

        # Extract out an array of courses
        # NOTE(review): "table1" is an unusual tag name - confirm it matches
        # the markup actually returned under "d".
        thisIterCourseListing = soupHtml.find_all("table1")

        for item in thisIterCourseListing:
            if item not in outputListing:
                outputListing.append(item)
                newIterationListingCount += 1

        # Advance exactly one page per request. Incrementing per new item
        # (as before) made the page number jump and skipped pages.
        page += 1

    return outputListing
# Example #3
# 0
def getAdditionalCourseDetails(courseDictionary):
    """Add the detail-page sections of a course to its dictionary.

    The detail page is requested with courseDictionary["course_code"] minus
    its first character as the IdProdInst query value. Every h2.title_main2
    heading on the page becomes a key (whitespace collapsed, colons removed)
    mapped to the whitespace-collapsed text of the next <p> element.

    Returns the result of processAdditionalCourseDetails() applied to the
    updated dictionary, or None when the page does not answer with HTTP 200;
    in that case saveSiteDownStatus(True) is called first.
    """
    # Drop the first character of the code (the leading 'C') for the query
    codeWithoutPrefix = courseDictionary["course_code"][1:]
    detailPageUrl = "https://one.pa.gov.sg/CRMSPortal/CRMSPortal.portal?_nfpb=true&_st=&_windowLabel=CRMSPortal_1&_urlType=render&_mode=view&wlpCRMSPortal_1_action=ACMParticipantMaintain&_pageLabel=CRMSPortal_page_1&IdProdInst=" + codeWithoutPrefix

    response = requests.get(detailPageUrl)

    # Anything other than HTTP 200 is treated as the operator site being down
    if response.status_code != 200:
        print("Operator site is unavailable")
        print("    Step: Get more information on a specific course")
        print("    URL: " + detailPageUrl)
        saveSiteDownStatus(True)
        return None

    pageSoup = BeautifulSoup(response.text, "html.parser")

    # Each section heading becomes a key; the paragraph after it is the value
    for heading in pageSoup.find_all("h2", {'class': 'title_main2'}):
        headingWords = heading.get_text().split()
        key = " ".join(headingWords).replace(":", "")
        paragraphWords = heading.find_next("p").get_text().split()
        courseDictionary[key] = " ".join(paragraphWords).strip()

    return processAdditionalCourseDetails(courseDictionary)
def getOutletActivityListing(url, outletId, operator):
    """List the activities of one outlet together with their vacancies.

    A form POST carrying `outletId` as 'idInternalCCRC' is sent to `url`.
    Activity codes are taken from the doRedirectToProducts(...) javascript
    links in the response text, and vacancies from the text following each
    "Vacancies:" label. The two streams of matches are paired by position;
    pairing stops when either stream runs out.

    Args:
        url: Address the form is posted to.
        outletId: Value sent as 'idInternalCCRC'.
        operator: Operator name; used only in the diagnostic output.

    Returns:
        A list of {"activity_code": ..., "vacancies": ...} dictionaries, or
        None when the response status is not 200 (saveSiteDownStatus(True)
        is called in that case).
    """
    formFields = {
        'idInternalCCRC': outletId,
        "cdSubj": "0",
        "btnGo.x": "42",
        "btnGo.y": "0"
    }
    response = requests.post(url,
                             data=formFields,
                             headers={'User-Agent': 'Mozilla/5.0'})

    # Anything other than HTTP 200 is treated as the operator site being down
    if response.status_code != 200:
        print("Operator site is unavailable")
        print("    Step: Get a list of activities by outlet")
        print("    Operator: " + operator)
        print("    URL: " + url)
        saveSiteDownStatus(True)
        return None

    pageText = response.text

    # Lazy iterators over the activity codes and the vacancy values
    codeMatches = re.finditer(
        r'javascript:doRedirectToProducts\(\'([0-9]*)\',\'[a-zA-Z]*\',\'[a-zA-Z]*\'\)',
        pageText)
    vacancyMatches = re.finditer(
        r'<SPAN class=body_boldtext>Vacancies:</SPAN>\s*(\d+|No Vacancy)',
        pageText)

    # The n-th code belongs with the n-th vacancy value
    return [{
        "activity_code": codeMatch.group(1),
        "vacancies": vacancyMatch.group(1)
    } for codeMatch, vacancyMatch in zip(codeMatches, vacancyMatches)]
def getIndividualExpeditionPage(individualExpeditionUrl, operator=None):
    """Download a single expedition page and return it as a parsed soup.

    Args:
        individualExpeditionUrl: Address of the expedition page. None or an
            empty string yields an empty soup without any request.
        operator: Optional operator name, printed in the diagnostic output
            when the site is unavailable.

    Returns:
        A BeautifulSoup object for the page, or an empty BeautifulSoup
        object when no URL was given or the response status is not 200
        (saveSiteDownStatus(True) is called in the latter case).
    """
    if not individualExpeditionUrl:
        return BeautifulSoup("", "html.parser")

    # Get Expedition Listing based on URL argument
    urlResponseObj = requests.get(individualExpeditionUrl)

    # Check if site is unavailable
    if urlResponseObj.status_code != 200:
        print("Operator site is unavailable")
        print("    Step: Get a list of expeditions")
        # The previous code printed `operator` and `url`, neither of which
        # is defined in this function, so this branch failed before it could
        # return. Report what is actually known here.
        if operator is not None:
            print("    Operator: " + str(operator))
        print("    URL: " + individualExpeditionUrl)
        saveSiteDownStatus(True)
        return BeautifulSoup("", "html.parser")

    # Construct a BeautifulSoup Object
    return BeautifulSoup(urlResponseObj.text, "html.parser")
# Example #6
# 0
def getCourseListing(url, operator):
    """Scrape the course table at `url` into a list of course dictionaries.

    The page's table.sub_table is read cell by cell. A non-blank cell whose
    text starts with 'C' followed by digits opens a new course and is stored
    under "course_code". The non-blank cells that follow are stored, in
    order, under "Organizing", "Venue", "Date", "Time", "Fee", "Vacancies"
    and "Action"; any further cells before the next course code are ignored,
    as are cells that appear before the first course code.

    Args:
        url: Address of the course listing page.
        operator: Operator name; used only in the diagnostic output.

    Returns:
        A list of course dictionaries (empty when the page has no
        table.sub_table), or None when the response status is not 200
        (saveSiteDownStatus(True) is called in that case).
    """
    # Get Course Listing based on URL argument
    urlResponseObj = requests.get(url)

    # Check if site is unavailable
    if urlResponseObj.status_code != 200:
        print("Operator site is unavailable")
        print("    Step: Get a list of courses")
        print("    Operator: " + operator)
        print("    URL: " + url)
        saveSiteDownStatus(True)
        return

    # Construct a BeautifulSoup Object
    soupHtml = BeautifulSoup(urlResponseObj.text, "html.parser")

    # Extract out the list of courses
    allCoursesRaw = soupHtml.find("table", {'class': 'sub_table'})
    if allCoursesRaw is None:
        # find() returns None when the table is absent; report "no courses"
        # instead of failing on the attribute access below.
        return []

    # Index 0 ("Course Code") is stored under "course_code" instead, so the
    # running index for the remaining columns starts at 1.
    courseTableFields = [
        "Course Code", "Organizing", "Venue", "Date", "Time", "Fee",
        "Vacancies", "Action"
    ]
    allCourseDictionary = []
    courseDictionary = {}
    courseTableIndex = 1

    for rawLine in allCoursesRaw.find_all("td"):

        # Pre-processing of rawLine
        line = rawLine.get_text().replace("\t", "").replace("\r\n", "")

        if not line.strip():
            continue

        # Raw string: '\d' in a normal string literal is an invalid escape
        if re.match(r'C\d+', line) is not None:
            # A new course starts here; bank the one collected so far
            if courseDictionary:
                allCourseDictionary.append(courseDictionary)

            courseDictionary = {"course_code": line}
            courseTableIndex = 1

        # A non-empty courseDictionary means a course code has been seen
        elif courseDictionary and courseTableIndex < len(courseTableFields):
            # Save the line with the right table index
            courseDictionary[courseTableFields[courseTableIndex]] = line
            courseTableIndex += 1

    # To add the last course into the allCourseDictionary list
    if courseDictionary:
        allCourseDictionary.append(courseDictionary)

    return allCourseDictionary