Esempio n. 1
0
def get_all_ctecs(subject, caesar_scraper=None):
    """Scrape every CTEC for ``subject`` and save the ones matching one course.

    For each course returned by ``caesar_scraper.get_courses(subject)`` and
    each CTEC listed for it by ``get_ctecs``, the CTEC is downloaded with
    ``get_ctec`` and looked up in the ``courses`` collection by term, catalog
    number, subject and section.  If several courses match, the lookup is
    narrowed by instructor name.  When exactly one course matches, the CTEC is
    saved to the ``ctecs`` collection under that course's ``_id``.

    Args:
        subject: Subject code handed to the scraper and compared against the
            subject reported by each downloaded CTEC.
        caesar_scraper: Scraper to use; a new ``CaesarScraper`` is created
            when this is falsy.
    """
    logging.debug('Starting %s', subject)
    caesar_scraper = caesar_scraper or CaesarScraper()
    for i, current_class_title in caesar_scraper.get_courses(subject):
        logging.debug("Starting %s %s %s", subject, current_class_title, i)
        # Depends only on the course title, so compute it before the CTEC
        # loop: the download-failure log below needs it, and previously read
        # it before it was ever assigned.
        current_catalog_num = current_class_title.split(":")[0]

        for j, quarter in caesar_scraper.get_ctecs(subject, i):
            ctec = caesar_scraper.get_ctec(subject, j)
            if ctec == {}:
                logging.error("Could not download %s %s %s %s %s",
                              subject, current_catalog_num, quarter, i, j)
                continue

            # The first whitespace-separated token of 'class_title' is split
            # on "-": the first two pieces form the catalog number, the third
            # is the section.
            class_code_parts = ctec['class_title'].split()[0].split("-")
            original_catalog_num = "-".join(class_code_parts[0:2])
            section = class_code_parts[2]
            subj = ctec['subj'].split()[0]

            ctec['current_class_title'] = current_class_title

            course_filter = {
                "term": quarter,
                "catalog_num": original_catalog_num,
                "subject": subj,
                "section": section,
            }
            courses_query = courses.find(course_filter)
            # count() is evaluated once per query instead of once per branch.
            match_count = courses_query.count()

            if match_count > 1:
                # Ambiguous: narrow by instructor, allowing anything between
                # the whitespace-separated parts of the name.
                # NOTE(review): the name is used as a regex without escaping;
                # confirm scraped names never contain regex metacharacters.
                instructor_filter = dict(course_filter)
                instructor_filter["instructor.name"] = {
                    "$regex": ".*".join(ctec['instructor'].split())
                }
                courses_query = courses.find(instructor_filter)
                match_count = courses_query.count()
                if match_count > 1:
                    logging.error(
                        "%s too many courses found for %s %s %s %s %s",
                        j, quarter, original_catalog_num, subj, section,
                        ctec['instructor'])
                elif match_count == 0:
                    # Previously tested "== 1", which logged this error for
                    # the successful single-match case and stayed silent when
                    # nothing matched.
                    logging.error(
                        "%s no courses found for %s %s %s %s %s",
                        j, quarter, original_catalog_num, subj, section,
                        ctec['instructor'])
            elif match_count == 0:
                logging.error("%s no courses found for %s %s %s %s",
                              j, quarter, original_catalog_num, subj, section)

            if match_count == 1:
                course = list(courses_query)[0]
                ctec['_id'] = course['_id']
                ctecs.save(ctec)
                logging.debug("Saved %s %s %s %s %s",
                              subject, current_catalog_num, quarter, i, j)

            # BUG, if the course starts with 300, CTEC thinks its part of the graduate school
            # for some subjects such as EECS
            if original_catalog_num[0] == "3" or subj != subject:
                # Re-submit the undergraduate search (career, then subject)
                # before re-listing courses and CTECs.
                for action in ("NW_CT_PB_SRCH_ACAD_CAREER",
                               "NW_CT_PB_SRCH_SUBJECT"):
                    caesar_scraper.post_doc(
                        caesar_scraper.CTEC_URL,
                        data={
                            "ICAction": action,
                            "NW_CT_PB_SRCH_ACAD_CAREER": "UGRD",
                            "NW_CT_PB_SRCH_SUBJECT": subject,
                            "NW_CT_PB_SRCH_NW_CTEC_SRCH_CHOIC$4$": "C",
                        })
                # sometimes after getting a single ctec, we need to get the courses AND ctecs again
                # because it routes back to the original search page
                caesar_scraper.get_courses(subject)
                caesar_scraper.get_ctecs(subject, i)
            elif subj == "AAL" or subj == "AF_AM_ST":
                # sometimes after getting a single ctec, we need to get ctecs again
                # because it routes back to the list of courses page
                caesar_scraper.get_ctecs(subject, i)

        # after getting all the ctecs for a single course, we need to get_courses again
        caesar_scraper.get_courses(subject)
    logging.debug('Finished %s', subject)
Esempio n. 2
0
from textblob import TextBlob
from models import ctecs

# Phrases whose occurrences in a CTEC essay are tallied into its
# 'easiness' and 'hardness' scores respectively.
easy_words = ["easy", "stress free", "painless", "little work", "no work", "breeze"]
hard_words = ["hard", "challenging", "difficult"]

# Annotate every stored CTEC with phrase counts and the tagged words of its
# essay, then write it back.
for ctec in ctecs.find():
    essay = ctec['essay']

    # Plain substring counts (str.count), summed over each phrase list.
    ctec['easiness'] = sum(essay.count(phrase) for phrase in easy_words)
    ctec['hardness'] = sum(essay.count(phrase) for phrase in hard_words)

    # Slashes become spaces before tagging; keep the words whose tag contains
    # "JJ" (presumably the adjective tags -- confirm against TextBlob's tagset).
    tagged_words = TextBlob(essay.replace("/", " ")).tags
    kept_words = [token for token, pos in tagged_words if "JJ" in pos]
    ctec['adjectives'] = " ".join(kept_words)

    ctecs.save(ctec)
    print(ctec['_id'])
Esempio n. 3
0
def get_all_ctecs(subject, caesar_scraper=None):
    """Scrape every CTEC for ``subject`` and save the ones matching one course.

    For each course returned by ``caesar_scraper.get_courses(subject)`` and
    each CTEC listed for it by ``get_ctecs``, the CTEC is downloaded with
    ``get_ctec`` and looked up in the ``courses`` collection by term, catalog
    number, subject and section.  If several courses match, the lookup is
    narrowed by instructor name.  When exactly one course matches, the CTEC is
    saved to the ``ctecs`` collection under that course's ``_id``.

    Args:
        subject: Subject code handed to the scraper and compared against the
            subject reported by each downloaded CTEC.
        caesar_scraper: Scraper to use; a new ``CaesarScraper`` is created
            when this is falsy.
    """
    logging.debug('Starting %s', subject)
    caesar_scraper = caesar_scraper or CaesarScraper()
    for i, current_class_title in caesar_scraper.get_courses(subject):
        logging.debug("Starting %s %s %s", subject, current_class_title, i)
        # Depends only on the course title, so compute it before the CTEC
        # loop: the download-failure log below needs it, and previously read
        # it before it was ever assigned.
        current_catalog_num = current_class_title.split(":")[0]

        for j, quarter in caesar_scraper.get_ctecs(subject, i):
            ctec = caesar_scraper.get_ctec(subject, j)
            if ctec == {}:
                logging.error("Could not download %s %s %s %s %s",
                              subject, current_catalog_num, quarter, i, j)
                continue

            # The first whitespace-separated token of 'class_title' is split
            # on "-": the first two pieces form the catalog number, the third
            # is the section.
            class_code_parts = ctec['class_title'].split()[0].split("-")
            original_catalog_num = "-".join(class_code_parts[0:2])
            section = class_code_parts[2]
            subj = ctec['subj'].split()[0]

            ctec['current_class_title'] = current_class_title

            course_filter = {
                "term": quarter,
                "catalog_num": original_catalog_num,
                "subject": subj,
                "section": section,
            }
            courses_query = courses.find(course_filter)
            # count() is evaluated once per query instead of once per branch.
            match_count = courses_query.count()

            if match_count > 1:
                # Ambiguous: narrow by instructor, allowing anything between
                # the whitespace-separated parts of the name.
                # NOTE(review): the name is used as a regex without escaping;
                # confirm scraped names never contain regex metacharacters.
                instructor_filter = dict(course_filter)
                instructor_filter["instructor.name"] = {
                    "$regex": ".*".join(ctec['instructor'].split())
                }
                courses_query = courses.find(instructor_filter)
                match_count = courses_query.count()
                if match_count > 1:
                    logging.error(
                        "%s too many courses found for %s %s %s %s %s",
                        j, quarter, original_catalog_num, subj, section,
                        ctec['instructor'])
                elif match_count == 0:
                    # Previously tested "== 1", which logged this error for
                    # the successful single-match case and stayed silent when
                    # nothing matched.
                    logging.error(
                        "%s no courses found for %s %s %s %s %s",
                        j, quarter, original_catalog_num, subj, section,
                        ctec['instructor'])
            elif match_count == 0:
                logging.error("%s no courses found for %s %s %s %s",
                              j, quarter, original_catalog_num, subj, section)

            if match_count == 1:
                course = list(courses_query)[0]
                ctec['_id'] = course['_id']
                ctecs.save(ctec)
                logging.debug("Saved %s %s %s %s %s",
                              subject, current_catalog_num, quarter, i, j)

            # BUG, if the course starts with 300, CTEC thinks its part of the graduate school
            # for some subjects such as EECS
            if original_catalog_num[0] == "3" or subj != subject:
                # Re-submit the undergraduate search (career, then subject)
                # before re-listing courses and CTECs.
                for action in ("NW_CT_PB_SRCH_ACAD_CAREER",
                               "NW_CT_PB_SRCH_SUBJECT"):
                    caesar_scraper.post_doc(
                        caesar_scraper.CTEC_URL,
                        data={
                            "ICAction": action,
                            "NW_CT_PB_SRCH_ACAD_CAREER": "UGRD",
                            "NW_CT_PB_SRCH_SUBJECT": subject,
                            "NW_CT_PB_SRCH_NW_CTEC_SRCH_CHOIC$4$": "C",
                        })
                # sometimes after getting a single ctec, we need to get the courses AND ctecs again
                # because it routes back to the original search page
                caesar_scraper.get_courses(subject)
                caesar_scraper.get_ctecs(subject, i)
            elif subj == "AAL" or subj == "AF_AM_ST":
                # sometimes after getting a single ctec, we need to get ctecs again
                # because it routes back to the list of courses page
                caesar_scraper.get_ctecs(subject, i)

        # after getting all the ctecs for a single course, we need to get_courses again
        caesar_scraper.get_courses(subject)
    logging.debug('Finished %s', subject)
Esempio n. 4
0
from textblob import TextBlob
from models import ctecs

# Phrases whose occurrences in a CTEC essay are tallied into its
# 'easiness' and 'hardness' scores respectively.
easy_words = [
    "easy", "stress free", "painless", "little work", "no work", "breeze"
]
hard_words = ["hard", "challenging", "difficult"]

# Annotate every stored CTEC with phrase counts and the tagged words of its
# essay, then write it back.
for ctec in ctecs.find():
    essay_text = ctec['essay']

    # Plain substring counts (str.count), summed over each phrase list.
    ctec['easiness'] = sum(essay_text.count(term) for term in easy_words)
    ctec['hardness'] = sum(essay_text.count(term) for term in hard_words)

    # Slashes become spaces before tagging; keep the words whose tag contains
    # "JJ" (presumably the adjective tags -- confirm against TextBlob's tagset).
    word_tag_pairs = TextBlob(essay_text.replace("/", " ")).tags
    ctec['adjectives'] = " ".join(
        token for token, pos in word_tag_pairs if "JJ" in pos)

    ctecs.save(ctec)
    print(ctec['_id'])