def get_all_ctecs(subject, caesar_scraper=None):
    """Scrape every CTEC for ``subject`` and save those matching one course.

    For each course the scraper lists, each of its CTECs is downloaded and
    matched against the ``courses`` collection by term, catalog number,
    subject and section (narrowed by instructor name when that is ambiguous).
    When exactly one course matches, the CTEC is saved to ``ctecs`` under that
    course's ``_id``.

    Args:
        subject: Subject code handed to the scraper and to the CTEC search form.
        caesar_scraper: Scraper to reuse; a new ``CaesarScraper`` is created
            when this is omitted (or falsy).

    Returns:
        None. Results are written to ``ctecs`` and progress is logged.
    """
    logging.debug('Starting %s' % subject)
    caesar_scraper = caesar_scraper or CaesarScraper()
    for i, current_class_title in caesar_scraper.get_courses(subject):
        logging.debug("Starting %s %s %s" % (subject, current_class_title, i))
        # Depends only on the course title, so compute it before the inner
        # loop: the "Could not download" message below needs it even when the
        # very first CTEC of a course fails to download.
        current_catalog_num = current_class_title.split(":")[0]
        for j, quarter in caesar_scraper.get_ctecs(subject, i):
            ctec = caesar_scraper.get_ctec(subject, j)
            if ctec == {}:
                # An empty dict is treated as a failed download.
                logging.error("Could not download %s %s %s %s %s" % (subject, current_catalog_num, quarter, i, j))
                continue

            # The first whitespace-separated token of the CTEC's class title is
            # split on "-": the first two pieces form the catalog number and
            # the third is the section.
            title_parts = ctec['class_title'].split()[0].split("-")
            original_catalog_num = "-".join(title_parts[0:2])
            ctec['current_class_title'] = current_class_title
            section = title_parts[2]
            subj = ctec['subj'].split()[0]

            courses_query = courses.find({"term": quarter, "catalog_num": original_catalog_num, "subject": subj, "section": section})
            match_count = courses_query.count()
            if match_count > 1:
                # Ambiguous: narrow by instructor, allowing anything between
                # the whitespace-separated parts of the name.
                # NOTE(review): the name is used as a regex unescaped - confirm
                # instructor names never contain regex metacharacters.
                courses_query = courses.find({"term": quarter, "catalog_num": original_catalog_num, "subject": subj, "section": section, "instructor.name": {"$regex": ".*".join(ctec['instructor'].split())}})
                match_count = courses_query.count()
                if match_count > 1:
                    logging.error("%s too many courses found for %s %s %s %s %s" % (j, quarter, original_catalog_num, subj, section, ctec['instructor']))
                elif match_count == 0:
                    # Only an empty result is an error here; exactly one match
                    # is the success case handled below.
                    logging.error("%s no courses found for %s %s %s %s %s" % (j, quarter, original_catalog_num, subj, section, ctec['instructor']))
            elif match_count == 0:
                logging.error("%s no courses found for %s %s %s %s" % (j, quarter, original_catalog_num, subj, section))

            if match_count == 1:
                course = list(courses_query)[0]
                # Reuse the course's _id so the CTEC is keyed by its course.
                ctec['_id'] = course['_id']
                ctecs.save(ctec)
                logging.debug("Saved %s %s %s %s %s" % (subject, current_catalog_num, quarter, i, j))

            # BUG: if the catalog number is in the 300s, CTEC thinks the course
            # is part of the graduate school for some subjects such as EECS.
            if original_catalog_num.startswith("3") or subj != subject:
                # Sometimes after getting a single CTEC we need to get the
                # courses AND the CTECs again, because the site routes back to
                # the original search page: re-submit the search form first.
                caesar_scraper.post_doc(caesar_scraper.CTEC_URL,
                                        data={"ICAction": "NW_CT_PB_SRCH_ACAD_CAREER", "NW_CT_PB_SRCH_ACAD_CAREER": "UGRD", "NW_CT_PB_SRCH_SUBJECT": subject, "NW_CT_PB_SRCH_NW_CTEC_SRCH_CHOIC$4$": "C"})
                caesar_scraper.post_doc(caesar_scraper.CTEC_URL,
                                        data={"ICAction": "NW_CT_PB_SRCH_SUBJECT", "NW_CT_PB_SRCH_ACAD_CAREER": "UGRD", "NW_CT_PB_SRCH_SUBJECT": subject, "NW_CT_PB_SRCH_NW_CTEC_SRCH_CHOIC$4$": "C"})
                caesar_scraper.get_courses(subject)
                caesar_scraper.get_ctecs(subject, i)
            elif subj == "AAL" or subj == "AF_AM_ST":
                # Sometimes after getting a single CTEC we need to get the
                # CTECs again, because the site routes back to the list of
                # courses page.
                caesar_scraper.get_ctecs(subject, i)

        # After getting all the CTECs for a single course, we need to
        # get_courses again.
        caesar_scraper.get_courses(subject)
    logging.debug('Finished %s' % subject)
from textblob import TextBlob
from models import ctecs

# Phrases counted in each essay for the two scores below.
easy_words = ["easy", "stress free", "painless", "little work", "no work", "breeze"]
hard_words = ["hard", "challenging", "difficult"]

# Annotate every stored CTEC with phrase counts and the tagged words of its
# essay, then write it back.
for ctec in ctecs.find():
    essay = ctec['essay']

    # Each score is the total number of (case-sensitive, non-overlapping)
    # occurrences of the listed phrases in the essay text.
    ctec['easiness'] = sum(essay.count(phrase) for phrase in easy_words)
    ctec['hardness'] = sum(essay.count(phrase) for phrase in hard_words)

    # Slashes become spaces before tagging; only words whose tag contains
    # "JJ" are kept, space-joined, under 'adjectives'.
    tagged = TextBlob(essay.replace("/", " ")).tags
    kept = [token for token, pos in tagged if "JJ" in pos]
    ctec['adjectives'] = " ".join(kept)

    ctecs.save(ctec)
    print(ctec['_id'])
def get_all_ctecs(subject, caesar_scraper=None):
    """Download all CTECs for ``subject`` and store the ones tied to a course.

    Walks every course returned by the scraper and every CTEC of that course.
    Each downloaded CTEC is looked up in the ``courses`` collection by term,
    catalog number, subject and section; if several courses match, the lookup
    is repeated with the instructor name as an extra filter. A CTEC is saved
    to ``ctecs`` (under the matching course's ``_id``) only when exactly one
    course matches; every other outcome is logged as an error.

    Args:
        subject: Subject code passed to the scraper and the CTEC search form.
        caesar_scraper: Existing scraper to use; when omitted (or falsy) a new
            ``CaesarScraper`` is constructed.

    Returns:
        None.
    """
    logging.debug('Starting %s' % subject)
    caesar_scraper = caesar_scraper or CaesarScraper()
    for i, current_class_title in caesar_scraper.get_courses(subject):
        logging.debug("Starting %s %s %s" % (subject, current_class_title, i))
        # Computed once per course, before any CTEC is fetched, so the
        # download-failure log below never reads an unassigned or stale value.
        current_catalog_num = current_class_title.split(":")[0]
        for j, quarter in caesar_scraper.get_ctecs(subject, i):
            ctec = caesar_scraper.get_ctec(subject, j)
            if ctec == {}:
                # An empty dict is treated as a failed download.
                logging.error("Could not download %s %s %s %s %s" %
                              (subject, current_catalog_num, quarter, i, j))
                continue

            # First token of the class title, split on "-": pieces 0-1 are the
            # catalog number, piece 2 is the section.
            title_parts = ctec['class_title'].split()[0].split("-")
            original_catalog_num = "-".join(title_parts[0:2])
            ctec['current_class_title'] = current_class_title
            section = title_parts[2]
            subj = ctec['subj'].split()[0]

            courses_query = courses.find({
                "term": quarter,
                "catalog_num": original_catalog_num,
                "subject": subj,
                "section": section
            })
            match_count = courses_query.count()
            if match_count > 1:
                # Several candidates: retry with the instructor name, letting
                # anything appear between its whitespace-separated parts.
                # NOTE(review): the name goes into the regex unescaped -
                # confirm it cannot contain regex metacharacters.
                courses_query = courses.find({
                    "term": quarter,
                    "catalog_num": original_catalog_num,
                    "subject": subj,
                    "section": section,
                    "instructor.name": {
                        "$regex": ".*".join(ctec['instructor'].split())
                    }
                })
                match_count = courses_query.count()
                if match_count > 1:
                    logging.error(
                        "%s too many courses found for %s %s %s %s %s" %
                        (j, quarter, original_catalog_num, subj, section,
                         ctec['instructor']))
                elif match_count == 0:
                    # Zero matches is the error; one match is saved below.
                    logging.error(
                        "%s no courses found for %s %s %s %s %s" %
                        (j, quarter, original_catalog_num, subj, section,
                         ctec['instructor']))
            elif match_count == 0:
                logging.error(
                    "%s no courses found for %s %s %s %s" %
                    (j, quarter, original_catalog_num, subj, section))

            if match_count == 1:
                course = list(courses_query)[0]
                # Key the CTEC by the course it belongs to.
                ctec['_id'] = course['_id']
                ctecs.save(ctec)
                logging.debug("Saved %s %s %s %s %s" %
                              (subject, current_catalog_num, quarter, i, j))

            # BUG: if the catalog number is in the 300s, CTEC thinks the course
            # is part of the graduate school for some subjects such as EECS.
            if original_catalog_num.startswith("3") or subj != subject:
                # Sometimes after getting a single CTEC we need to get the
                # courses AND the CTECs again, because the site routes back to
                # the original search page: re-submit the search form first.
                caesar_scraper.post_doc(
                    caesar_scraper.CTEC_URL,
                    data={
                        "ICAction": "NW_CT_PB_SRCH_ACAD_CAREER",
                        "NW_CT_PB_SRCH_ACAD_CAREER": "UGRD",
                        "NW_CT_PB_SRCH_SUBJECT": subject,
                        "NW_CT_PB_SRCH_NW_CTEC_SRCH_CHOIC$4$": "C"
                    })
                caesar_scraper.post_doc(
                    caesar_scraper.CTEC_URL,
                    data={
                        "ICAction": "NW_CT_PB_SRCH_SUBJECT",
                        "NW_CT_PB_SRCH_ACAD_CAREER": "UGRD",
                        "NW_CT_PB_SRCH_SUBJECT": subject,
                        "NW_CT_PB_SRCH_NW_CTEC_SRCH_CHOIC$4$": "C"
                    })
                caesar_scraper.get_courses(subject)
                caesar_scraper.get_ctecs(subject, i)
            elif subj == "AAL" or subj == "AF_AM_ST":
                # Sometimes after getting a single CTEC we need to get the
                # CTECs again, because the site routes back to the list of
                # courses page.
                caesar_scraper.get_ctecs(subject, i)

        # After getting all the CTECs for a single course, we need to
        # get_courses again.
        caesar_scraper.get_courses(subject)
    logging.debug('Finished %s' % subject)
from textblob import TextBlob
from models import ctecs

# Phrases whose occurrences in an essay add to the 'easiness' score.
easy_words = [
    "easy", "stress free", "painless", "little work", "no work", "breeze"
]
# Phrases whose occurrences in an essay add to the 'hardness' score.
hard_words = ["hard", "challenging", "difficult"]

# Go through every CTEC document, derive three fields from its essay text and
# save the document back.
for ctec in ctecs.find():
    text = ctec['essay']

    # Plain substring counts (case-sensitive), summed over each phrase list.
    ctec['easiness'] = sum(text.count(term) for term in easy_words)
    ctec['hardness'] = sum(text.count(term) for term in hard_words)

    # "/" is replaced by a space before the text is tagged; words whose tag
    # contains "JJ" are collected into one space-separated string.
    blob = TextBlob(text.replace("/", " "))
    jj_words = [token for token, label in blob.tags if "JJ" in label]
    ctec['adjectives'] = " ".join(jj_words)

    ctecs.save(ctec)
    print(ctec['_id'])