def get_prof_page(self, extension):
    try:
        prof_page = scraper_base.get_soup(f"{self.TOP_LINK}{extension}")
    except requests.exceptions.RequestException as e:
        print(WARNING, f"Failed scraping {self.TOP_LINK}{extension}: {e}")
        return None
    else:
        print(DEBUG, f"Retrieved professor page from {self.TOP_LINK}{extension}")

    prof_name = prof_page.title.text.strip()
    # Why is all relevant data in a button block? I have no idea.
    main_block = prof_page.findAll('button')[2]
    prof_rating = main_block.find("span", {"class": "teacher-rating"}).text
    prof_difficulty = main_block.find("span", {"class": "evals-span"}).text
    if prof_difficulty:
        prof_difficulty = prof_difficulty.split()[1]
    p = {
        "NAME": prof_name,
        "RATING": prof_rating,
        "DIFFICULTY": prof_difficulty
    }
    return p
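# Every scraper in this section goes through scraper_base.get_soup, which is
# defined elsewhere in the repo. A minimal sketch of what the call sites imply
# it does -- assuming it wraps requests and BeautifulSoup; the real helper may
# differ (the `ver` keyword name is taken from calls like get_soup(url, ver=False)):

import requests
from bs4 import BeautifulSoup


def get_soup(url, ver=True):
    # raise_for_status() raises requests.exceptions.HTTPError with messages
    # like "404 Client Error: ...", which is what the ratings scraper's
    # startswith('404 Client Error') check relies on.
    response = requests.get(url, verify=ver)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')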
def scrape(self):
    data = []
    print(
        DEBUG,
        f"Starting calpolyratings scrape: TOP_LINK={self.TOP_LINK}, REST_TIME={self.REST_TIME}ms"
    )
    page_num = 1
    while True:
        try:
            soup = scraper_base.get_soup(f"{self.TOP_LINK}/?page={page_num}")
        except requests.exceptions.RequestException as e:
            # Keep trying to get new pages until 404. On 404, return existing data.
            if str(e).startswith('404 Client Error'):
                print(NOTICE, f"Page {page_num} not found. Ending scrape")
                print(SUCCESS, f"Done! Scraped ratings for {len(data)} professors")
                return pd.DataFrame(data).to_csv(None, index=False)
            print(ALERT, e)
            return None
        else:
            print(SUCCESS, f"Retrieved page {page_num}")
            page_num += 1

        links = (a['href'] for a in soup.find_all('a', href=True))
        prof_links = [
            a for a in links if a.startswith('/') and not a.endswith('/')
        ]
        for prof in prof_links:
            sleep(self.REST_TIME / 1000)
            page = self.get_prof_page(prof)
            if page:
                data.append(page)
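# Hypothetical usage sketch: scrape() returns a CSV string (or None on
# failure), so a caller can round-trip it through pandas for inspection.
# RatingsScraper is an assumed name for illustration; the actual class name
# is not shown in this section.

import io

import pandas as pd

csv_data = RatingsScraper().scrape()  # hypothetical class name
if csv_data is not None:
    df = pd.read_csv(io.StringIO(csv_data))
    print(df[['NAME', 'RATING', 'DIFFICULTY']].head())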
def scrape_schedules_from_url(self, url, verify=True, preprocess=None):
    try:
        soup = scraper_base.get_soup(url, verify)
    except requests.exceptions.RequestException as e:
        print(ALERT, e)
        return None
    print(SUCCESS, "Retrieved schedules page")
    return self.scrape_schedules_from_html(str(soup), preprocess)
def scrape(self): """ Scrapes academic calendar data to CSV returns: str: A CSV string of scraped data """ starting_year = self.CALENDAR_EPOCH calendar = dict() while True: ending_year = starting_year + 1 current_year = starting_year calendar_url = f'{self.TOP_LINK}/{starting_year}-{ending_year - 2000}-academic-calendar' try: calendar_soup = scraper_base.get_soup(calendar_url) # Returns on an invalid school year; should always return. except requests.exceptions.HTTPError: dates = list(calendar.values()) return pd.DataFrame(dates).to_csv(None, index=False) else: # Finds all tables on the page (summer/fall/winter/spring quarters) # Excludes the last summary table. # Note: summary table id has a space at the end. All years are like this. tables = calendar_soup.find_all( lambda tag: tag.name == 'table' and tag.has_attr( 'id') and tag['id'] != "SUMMARY OF CALENDAR DAYS ") for table in tables: for row in table.find_all('tr'): cols = row.find_all('td') dates = cols[0].text parsed_dates = self.parse_dates(dates, current_year) # Ugly solution to change the calendar year during the school year. # Assumes there will always be an event in January. if 'January' in parsed_dates and current_year != ending_year: current_year = ending_year # Second column is just the days of the week; ignore events = [t.strip() for t in cols[2].text.splitlines()] for month, days in parsed_dates.items(): for day in days: date = self.make_date(month, day, current_year) if date in calendar: calendar[date]['EVENTS'].extend(events) else: entry = dict() entry['DATE'] = date entry['DAY'] = day entry['MONTH'] = month entry['YEAR'] = current_year entry['EVENTS'] = events calendar[date] = entry starting_year += 1
def scrape(self): """ Scrapes data from all CPE and CSC employees args: no_upload (Bool): Used by scraper.py to control uploading vs just returning scraped data returns: str: A CSV string of scraped data """ scraped_faculty = [] # Verification turned off; read main note in self.parse_single_employee soup = scraper_base.get_soup(self.CSC_TOP_LINK, ver=False) for link in soup.find_all("a", href=True): nav = link["href"] if (nav.startswith("/faculty/") or nav.startswith("/staff")) and (nav != "/faculty/" and nav != "/staff/"): info = self.parse_single_employee("https://csc.calpoly.edu" + nav) scraped_faculty.append(info) sleep(self.REST_TIME / 1000) soup = scraper_base.get_soup(self.CPE_TOP_LINK) for link in soup.find_all("a", href=True): nav = link["href"] if (nav.startswith("/faculty/") or nav.startswith("/staff")) and (nav != "/faculty/" and nav != "/staff/"): info = self.parse_single_employee("https://cpe.calpoly.edu" + nav) scraped_faculty.append(info) sleep(self.REST_TIME / 1000) return pd.DataFrame(scraped_faculty).to_csv(None, index=False)
def scrape(self): """ Scrapes club information to CSV returns: str: A CSV string of scraped data """ top = scraper_base.get_soup(self.TOP_LINK) raw = [l.text.strip() for l in top.find_all('span')] # Filters out some info we don't need info = [x for x in raw if x and x != "Website" and x != "Homepage:"] info.pop(0) # Don't need first line current_club = None info_len = len(info) club_info = dict() scraped_clubs = [] i = 0 while i < info_len: line = info[i] if line in self.INFO_ENTRY_PAIRS: next_line = info[i + 1] entry_name = self.INFO_ENTRY_PAIRS[line] if next_line.endswith(':'): club_info[entry_name] = 'NA' i += 1 else: club_info[entry_name] = next_line i += 2 elif line == "Contact Email:": next_line = info[i + 1] try: bool(club_info['CONTACT_EMAIL'] ) # Checks if there's already a contact_email entry. except KeyError: if next_line.endswith(':'): club_info['CONTACT_EMAIL'] = 'NA' i += 1 else: club_info['CONTACT_EMAIL'] = next_line i += 2 else: # Two fields are called "Contact Email"--the email of the main contact for the club, and the club's # official email to contact them. For now, the official email is called "contact_email_2" if '@' in next_line: # Next line could be the name of another club; doesn't end in ':' club_info['CONTACT_EMAIL_2'] = next_line i += 2 else: club_info['CONTACT_EMAIL_2'] = 'NA' i += 1 else: # Checking len(club_info) filters unused addresses and stuff that get parsed as club names. if current_club and len(club_info) != 0: club_info['NAME'] = current_club scraped_clubs.append(club_info) club_info = dict() current_club = line i += 1 return pd.DataFrame(scraped_clubs).to_csv(None, index=False)
def scrape(self, all_departments=False):
    """
    Scrapes course information and requirements to CSV

    args:
        all_departments (bool): Scrapes all departments if True, or just CPE and CSC if False (default False)

    returns:
        str: A CSV string of scraped data
    """
    # Retrieves department list from Cal Poly
    print(
        DEBUG,
        f"Starting course scrape: all_departments={all_departments}, REST_TIME={self.REST_TIME}"
    )
    print(INFO, "Starting course scrape")
    if all_departments:
        top_link = "http://catalog.calpoly.edu/coursesaz/"
        print(INFO, f"Starting scrape on {top_link}")
        try:
            top_soup = scraper_base.get_soup(top_link, ver=False)
        except requests.exceptions.RequestException as e:
            print(ALERT, e)
            return None
        print(SUCCESS, "Retrieved top-level courses page")
        # Changed scraping method because source for visible links changed, but
        # old links are still in the source and cause some 404 errors
        departments_az = top_soup.find('table')
        department_urls = [
            department.get('href')
            for department in departments_az.find_all('a')
            if department.get('href')
        ]
        if not department_urls:
            print(ALERT, "Couldn't find departments list. Aborting scrape.")
            return None
        print(INFO, f"Found URLs for {len(department_urls)} departments")
    else:
        print(INFO, "Just scraping CSC and CPE courses")
        department_urls = ['/coursesaz/csc/', '/coursesaz/cpe/']

    scraped_courses = []
    # Retrieves course info for each department
    for department in department_urls:
        # Extracts the department name from the URL
        dep_name = (department.rsplit('/', 2)[1]).upper()
        # Gets raw list of courses and info for department
        dep_link = 'http://catalog.calpoly.edu' + department
        try:
            dep_soup = scraper_base.get_soup(dep_link)
            sleep(self.REST_TIME / 1000)
        except requests.exceptions.RequestException as e:
            print(ALERT, e)
            return None
        print(SUCCESS, f"Retrieved {dep_name} courses from {dep_link}")
        courses = dep_soup.findAll("div", {"class": "courseblock"})
        print(DEBUG, f"Found {len(courses)} courses")
        for course in courses:
            course_name_and_units = (course.find(
                "p", {"class": "courseblocktitle"})).get_text()
            course_name, course_units = course_name_and_units.splitlines()
            print(DEBUG, f"Found {course_name}")
            course_units = course_units.split(' ', 1)[0]
            paragraphs = course.findAll('p')
            if len(paragraphs) == 5:
                ge_areas = re.findall(r'Area (\w+)', paragraphs[1].text)
            else:
                ge_areas = None
            course_terms_and_reqs = (course.find(
                "div", {"class": "noindent courseextendedwrap"})).get_text()
            section = None
            course_prereqs, course_coreqs, course_conc, course_rec, course_terms = [], [], [], [], []
            for word in course_terms_and_reqs.split():
                if word.endswith(':'):
                    if word == 'Offered:':
                        section = 'terms'
                    # Last term (F,W,SP, etc) will be appended to the front of "Prerequisite:"
                    # or whatever category comes immediately after terms offered, so
                    # "str.endswith(blah)" has to be done instead of "str == blah"
                    elif word.endswith('Prerequisite:'):
                        try:
                            course_terms.append((word.split('Pre'))[0])
                            print(DEBUG, "Found prerequisites")
                        except IndexError:
                            pass
                        section = 'prereq'
                    elif word.endswith('Corequisite:'):
                        try:
                            course_terms.append((word.split('Cor'))[0])
                            print(DEBUG, "Found corequisites")
                        except IndexError:
                            pass
                        section = 'coreq'
                    elif word.endswith('Concurrent:'):
                        try:
                            course_terms.append((word.split('Con'))[0])
                            print(DEBUG, "Found concurrent courses")
                        except IndexError:
                            pass
                        section = 'conc'
                    elif word.endswith('Recommended:'):
                        try:
                            course_terms.append((word.split('Rec'))[0])
                            print(DEBUG, "Found recommended courses")
                        except IndexError:
                            pass
                        section = 'rec'
                    else:
                        pass
                else:
                    if section == 'prereq':
                        course_prereqs.append(word)
                    elif section == 'coreq':
                        course_coreqs.append(word)
                    elif section == 'conc':
                        course_conc.append(word)
                    elif section == 'rec':
                        course_rec.append(word)
                    elif section == 'terms':
                        course_terms.append(word)
                    else:
                        pass
            # Update: Now joined with a comma
            maybe_join = (lambda x: ','.join(x) if x else 'NA')
            course_prereqs = maybe_join(course_prereqs)
            course_coreqs = maybe_join(course_coreqs)
            course_conc = maybe_join(course_conc)
            course_rec = maybe_join(course_rec)
            course_terms = maybe_join(course_terms)
            ge_areas = maybe_join(ge_areas)
            document = {
                "DEPARTMENT": dep_name,
                "COURSE_NAME": course_name,
                "UNITS": course_units,
                "PREREQUISITES": course_prereqs,
                "COREQUISITES": course_coreqs,
                "CONCURRENT": course_conc,
                "RECOMMENDED": course_rec,
                "TERMS_TYPICALLY_OFFERED": course_terms,
                "GE_AREAS": ge_areas
            }
            scraped_courses.append(document)
    print(SUCCESS, f"Done! Scraped {len(scraped_courses)} courses")
    return pd.DataFrame(scraped_courses).to_csv(None, index=False)
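# A quick check of the GE-area regex used above; the sample string is made up
# but mirrors the catalog's "Area X" phrasing:

import re

sample = "Fulfills GE Area B1 and GE Area C2"
print(re.findall(r'Area (\w+)', sample))  # ['B1', 'C2']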
def scrape_schedules_from_url(self, url, verify=True, preprocess=None):
    soup = scraper_base.get_soup(url, verify)
    return self.scrape_schedules_from_html(str(soup), preprocess)
def scrape(self): """ Scrapes club information to CSV returns: str: A CSV string of scraped data """ print(INFO, f'Starting scrape on {self.TOP_LINK}') try: top = scraper_base.get_soup(self.TOP_LINK) except requests.exceptions.RequestException as e: print(ALERT, e) return None print(SUCCESS, f'Retrieved club list') raw = [l.text.strip() for l in top.find_all('span')] # Filters out some info we don't need info = [x for x in raw if x and x != "Website" and x != "Homepage:"] info.pop(0) # Don't need first line current_club = None info_len = len(info) club_info = dict() scraped_clubs = [] i = 0 while i < info_len: line = info[i] if line in self.INFO_ENTRY_PAIRS: next_line = info[i + 1] entry_name = self.INFO_ENTRY_PAIRS[line] if next_line.endswith(':'): club_info[entry_name] = 'NA' i += 1 else: club_info[entry_name] = next_line i += 2 elif line == "Contact Email:": next_line = info[i + 1] try: bool(club_info['CONTACT_EMAIL'] ) # Checks if there's already a contact_email entry. except KeyError: if next_line.endswith(':'): club_info['CONTACT_EMAIL'] = 'NA' i += 1 else: club_info['CONTACT_EMAIL'] = next_line i += 2 else: # Two fields are called "Contact Email"--the email of the main contact for the club, and the club's # official email to contact them. For now, the official email is called "contact_email_2" if '@' in next_line: # Next line could be the name of another club; doesn't end in ':' club_info['CONTACT_EMAIL_2'] = next_line i += 2 else: club_info['CONTACT_EMAIL_2'] = 'NA' i += 1 else: # Checking len(club_info) filters unused addresses and stuff that get parsed as club names. if current_club and len(club_info) != 0: club_info['NAME'] = current_club scraped_clubs.append(club_info) print(DEBUG, f'Scraped {current_club}') else: print(DEBUG, f'Discarding non-club {current_club}') club_info = dict() current_club = line i += 1 clubs_request = json.dumps({ 'clubs': [self.transform_club_to_db(club) for club in scraped_clubs] }) requests.post(url=self.CLUBS_API, json=clubs_request) print(SUCCESS, f'Done! Scraped {len(scraped_clubs)} clubs') return pd.DataFrame(scraped_clubs).to_csv(None, index=False)
def scrape(self): """ Scrapes academic calendar data to CSV returns: str: A CSV string of scraped data """ print( DEBUG, f"Starting calendar scrape: CALENDAR_EPOCH={self.CALENDAR_EPOCH}, TOP_LINK={self.TOP_LINK}" ) starting_year = self.CALENDAR_EPOCH calendar = dict() while True: ending_year = starting_year + 1 current_year = starting_year calendar_url = f'{self.TOP_LINK}/{starting_year}-{ending_year - 2000}-academic-calendar' print( DEBUG, f"Attempting to retrieve {starting_year}-{ending_year} calendar from {calendar_url}" ) try: calendar_soup = scraper_base.get_soup(calendar_url) # Returns on an invalid school year; should always return. except requests.exceptions.HTTPError: print( NOTICE, f"{starting_year}-{ending_year} calendar doesn't exist. Ending scrape." ) dates = list(calendar.values()) if len(dates) > 0: print( SUCCESS, f"Done! Scraped {ending_year-self.CALENDAR_EPOCH-1} calendar(s)" ) else: print(ERR, "Did not successfully scrape any dates") return None return pd.DataFrame(dates).to_csv(None, index=False) except requests.exceptions.RequestException as e: print(ALERT, e) return None else: print( SUCCESS, f"Successfully retrieved {starting_year}-{ending_year} calendar" ) # Finds all tables on the page (summer/fall/winter/spring quarters) # Excludes the last summary table. # Note: summary table id has a space at the end. All years are like this. tables = calendar_soup.find_all( lambda tag: tag.name == 'table' and tag.has_attr( 'id') and tag['id'] != "SUMMARY OF CALENDAR DAYS ") print(DEBUG, f"Found {len(tables)} tables") for table in tables: for row in table.find_all('tr'): cols = row.find_all('td') dates = cols[0].text parsed_dates = self.parse_dates(dates, current_year) # Ugly solution to change the calendar year during the school year. # Assumes there will always be an event in January. if 'January' in parsed_dates and current_year != ending_year: print( DEBUG, f"Switching current year from {current_year} to {ending_year}" ) current_year = ending_year # Second column is just the days of the week; ignore events = [t.strip() for t in cols[2].text.splitlines()] for month, days in parsed_dates.items(): for day in days: date = self.make_date(month, day, current_year) if date in calendar: print(DEBUG, f"Adding event to {date}") calendar[date]['EVENTS'].extend(events) else: print(DEBUG, f"Making calendar entry for {date}") entry = dict() entry['DATE'] = date entry['DAY'] = day entry['MONTH'] = month entry['YEAR'] = current_year entry['EVENTS'] = events calendar[date] = entry starting_year += 1 scraped_calendar_entries = [] for date, full_entry in calendar.items(): for event in full_entry['EVENTS']: scraped_calendar_entries.append({ 'date': full_entry['DATE'], 'day': full_entry['DAY'], 'month': full_entry['MONTH'], 'year': full_entry['YEAR'], 'raw_events_text': event, }) calendars_request = json.dumps( {'calendars': scraped_calendar_entries}) requests.post(url=self.CALENDARS_API, json=calendars_request)
def parse_single_employee(self, url):
    """
    Scrapes data from a single Cal Poly employee.

    args:
        url (str)

    returns:
        dict(str:str)
    """
    # Due to certificate issues with CSC employee pages, verification
    # is turned off for requests in the scraper module. This leads to
    # lots of warnings during runtime but unaffected data.
    soup = scraper_base.get_soup(url, ver=False)
    name = soup.find("h1").text
    office = 'NA'
    email = 'NA'
    phone = 'NA'
    research_interests = 'NA'
    # Information is stored in different blocks for staff and faculty
    if url.rsplit("/", 3)[1] == "staff":
        main_info_text = [l.text for l in soup.find_all("span")]
        table = soup.find("table")
    else:
        faculty_main_info = soup.find(id="facultyMainBlock")
        table = faculty_main_info.find("table")
        main_info_text = faculty_main_info.text.splitlines()
    # Getting the research interests for every professor
    # Doesn't work for Hasmik Gharibyan and other people who have a biography
    faculty_additional_info = soup.find(class_="facultyBlock")
    if faculty_additional_info is not None and faculty_additional_info.span:
        research_interests = []
        for string in faculty_additional_info.stripped_strings:
            research_interests.append(string)

    found_office = found_phone = found_email = False
    for line in main_info_text:
        if not found_office:
            if line.startswith("Office:"):
                office = line.rsplit(" ", 1)[1]
                found_office = True
        if not found_phone:
            if line.startswith("Phone "):
                phone = line.rsplit(" ", 1)[1]
                found_phone = True
        if not found_email:
            if line.startswith("Email:"):
                # For faculty pages, the character following "Email:" is not a space but \xa0
                try:
                    first_split = line.split("\xa0", 1)[1]
                except IndexError:
                    first_split = line.split(" ", 1)[1]
                email = first_split.split("(", 1)[0] + "@calpoly.edu"
                found_email = True
        if found_office and found_phone and found_email:
            break

    # office_hours = dict()
    # Office hour parsing is currently disabled. The previous solution only worked for the CPE department
    # and stored office hours in a dictionary, which is not represented in a CSV file. A separate source
    # of office hours should replace this.
    #
    # # This method DOES NOT parse office hours for staff, whose office hours are single-line formatted
    # # like: "Mon - Fri: 10:00 am to 5:00 pm". Additional parsing will have to be added for them. For now,
    # # their office hours will show up as 'NA'
    # try:
    #     rows = table.find_all("tr")
    # except AttributeError:
    #     # Best way to represent no office hours? Currently using 'NA' as a string
    #     office_hours = "NA"
    # else:
    #     rows.pop(0)  # Don't need header row
    #     for row in rows:
    #         col = row.find_all("td")
    #         # Turns "Monday, Wednesday" into ['Monday', 'Wednesday']
    #         days = col[0].text.replace(",", "").split()
    #         time = col[1].text
    #         room = col[2].text
    #         for day in days:
    #             office_hours[day] = dict()
    #             office_hours[day]["time"] = time
    #             office_hours[day]["room"] = room

    faculty_info = {
        "NAME": name,
        "OFFICE": office,
        "EMAIL": email,
        "PHONE": phone,
        "RESEARCH_INTERESTS": research_interests,
    }
    return faculty_info
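# The disabled block above notes that staff office hours arrive as one line,
# e.g. "Mon - Fri: 10:00 am to 5:00 pm". If that parsing is ever re-enabled,
# here is a sketch of splitting that format (an untested guess, not repo code):

def parse_staff_hours(line):
    # partition(':') splits only at the first colon, so the times keep theirs.
    days_part, _, time_part = line.partition(':')
    start_day, _, end_day = days_part.partition('-')
    return {
        'days': (start_day.strip(), end_day.strip()),  # e.g. ('Mon', 'Fri')
        'time': time_part.strip(),                     # e.g. '10:00 am to 5:00 pm'
    }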
def scrape(self, all_departments=False):
    """
    Scrapes course information and requirements to CSV

    args:
        all_departments (bool): Scrapes all departments if True, or just CPE and CSC if False (default False)

    returns:
        str: A CSV string of scraped data
    """
    # Retrieves department list from Cal Poly
    if all_departments:
        top_link = "http://catalog.calpoly.edu/coursesaz/"
        top_soup = scraper_base.get_soup(top_link, ver=False)
        # Changed scraping method because source for visible links changed, but
        # old links are still in the source and cause some 404 errors
        departments_az = top_soup.find('table')
        department_urls = [
            department.get('href')
            for department in departments_az.find_all('a')
            if department.get('href')
        ]
    else:
        department_urls = ['/coursesaz/csc/', '/coursesaz/cpe/']

    scraped_courses = []
    # Retrieves course info for each department
    for department in department_urls:
        # Extracts the department name from the URL
        dep_name = (department.rsplit('/', 2)[1]).upper()
        # Gets raw list of courses and info for department
        dep_link = 'http://catalog.calpoly.edu' + department
        dep_soup = scraper_base.get_soup(dep_link)
        courses = dep_soup.findAll("div", {"class": "courseblock"})
        for course in courses:
            course_name_and_units = (course.find(
                "p", {"class": "courseblocktitle"})).get_text()
            course_name, course_units = course_name_and_units.splitlines()
            course_units = course_units.split(' ', 1)[0]
            course_terms_and_reqs = (course.find(
                "div", {"class": "noindent courseextendedwrap"})).get_text()
            section = None
            course_prereqs, course_coreqs, course_conc, course_rec, course_terms = [], [], [], [], []
            for word in course_terms_and_reqs.split():
                if word.endswith(':'):
                    if word == 'Offered:':
                        section = 'terms'
                    # Last term (F,W,SP, etc) will be appended to the front of "Prerequisite:"
                    # or whatever category comes immediately after terms offered, so
                    # "str.endswith(blah)" has to be done instead of "str == blah"
                    elif word.endswith('Prerequisite:'):
                        try:
                            course_terms.append((word.split('Pre'))[0])
                        except IndexError:
                            pass
                        section = 'prereq'
                    elif word.endswith('Corequisite:'):
                        try:
                            course_terms.append((word.split('Cor'))[0])
                        except IndexError:
                            pass
                        section = 'coreq'
                    elif word.endswith('Concurrent:'):
                        try:
                            course_terms.append((word.split('Con'))[0])
                        except IndexError:
                            pass
                        section = 'conc'
                    elif word.endswith('Recommended:'):
                        try:
                            course_terms.append((word.split('Rec'))[0])
                        except IndexError:
                            pass
                        section = 'rec'
                    else:
                        pass
                else:
                    if section == 'prereq':
                        course_prereqs.append(word)
                    elif section == 'coreq':
                        course_coreqs.append(word)
                    elif section == 'conc':
                        course_conc.append(word)
                    elif section == 'rec':
                        course_rec.append(word)
                    elif section == 'terms':
                        course_terms.append(word)
                    else:
                        pass
            maybe_join = (lambda x: ' '.join(x) if x else 'NA')
            course_prereqs = maybe_join(course_prereqs)
            course_coreqs = maybe_join(course_coreqs)
            course_conc = maybe_join(course_conc)
            course_rec = maybe_join(course_rec)
            course_terms = maybe_join(course_terms)
            document = {
                "DEPARTMENT": dep_name,
                "COURSE_NAME": course_name,
                "UNITS": course_units,
                "PREREQUISITES": course_prereqs,
                "COREQUISITES": course_coreqs,
                "CONCURRENT": course_conc,
                "RECOMMENDED": course_rec,
                "TERMS_TYPICALLY_OFFERED": course_terms
            }
            scraped_courses.append(document)
    return pd.DataFrame(scraped_courses).to_csv(None, index=False)