def group_ctecs_by_course_id():
    """Bucket merged course/CTEC records by their ``course_id``.

    Each document yielded by ``ctecs.find()`` is paired with the ``courses``
    document that has the same ``_id``. The two are merged into a fresh dict
    (CTEC values win on key clashes), ``_id`` is renamed to ``id``, and the
    merged record is appended to the bucket keyed by ``str(course_id)``.

    Returns:
        A ``defaultdict(list)`` mapping stringified course ids to the list of
        merged records, in the order the CTECs were iterated.
    """
    buckets = defaultdict(list)
    for review in ctecs.find():
        # Build on a copy of the course so the fetched document is untouched.
        merged = dict(courses.find_one({"_id": review["_id"]}))
        merged.update(review)
        merged['id'] = merged.pop('_id')
        buckets[str(merged['course_id'])].append(merged)
    return buckets
from textblob import TextBlob

from models import ctecs

# Phrases whose occurrences in an essay are tallied into the 'easiness' and
# 'hardness' scores. Matching is a plain, case-sensitive substring count.
easy_words = ["easy", "stress free", "painless", "little work", "no work", "breeze"]
hard_words = ["hard", "challenging", "difficult"]

# Annotate every CTEC document with its scores and adjective list, persist
# it, and print its _id as a progress marker.
for ctec in ctecs.find():
    essay = ctec['essay']
    ctec['easiness'] = sum(essay.count(phrase) for phrase in easy_words)
    ctec['hardness'] = sum(essay.count(phrase) for phrase in hard_words)

    # Slashes become spaces before tagging (presumably so "a/b" is tagged as
    # two separate words); tokens whose tag contains "JJ" are kept and stored
    # space-separated under 'adjectives'.
    tagged = TextBlob(essay.replace("/", " ")).tags
    adjectives = [token for token, pos in tagged if "JJ" in pos]
    ctec['adjectives'] = " ".join(adjectives)

    ctecs.save(ctec)
    print(ctec['_id'])
from models import ctecs, courses, terms

# Dump every CTEC that has a 'corrected_essay' field: a header line, the
# 'essay' text, then a separator line.
has_correction = {'corrected_essay': {'$exists': True}}
header_template = unicode("[{academic_term}] - {subj} {class_title}")
separator = "-----------------------------"

for ctec in ctecs.find(has_correction):
    # Only the first whitespace-delimited token of the subject is shown.
    ctec['subj'] = ctec['subj'].split()[0]
    print(header_template.format(**ctec))
    print(ctec['essay'])
    print(separator)

# Disabled: write each course's CTEC titles and essays to temp/<course_id>.txt.
# from utils import group_ctecs_by_course_id
# grouped_course_ctecs = group_ctecs_by_course_id()
# for course_id, course_ctecs in grouped_course_ctecs.iteritems():
#     with open("temp/%s.txt" % course_id, "w") as f:
#         for course_ctec in course_ctecs:
#             f.write(course_ctec['title'])
#             f.write("\n")
#             f.write(course_ctec['essay'].encode('utf-8'))
#             f.write("\n\n")
import csv

from models import ctecs, courses

# Columns taken from the course document (plus the derived id/year/quarter).
course_fieldnames = [
    "id", "term", "year", "quarter", "course_id", "class_num", "school",
    "subject", "catalog_num", "section", "title", "instructor",
    "start_time", "end_time", "meeting_days",
]
# Columns taken from the CTEC document.
ctec_fieldnames = [
    "enrollment_count", "response_count",
    "question0_average_rating", "question1_average_rating",
    "question2_average_rating", "question3_average_rating",
    "question4_average_rating",
    "easiness", "hardness", "essay",
]
fieldnames = course_fieldnames + ctec_fieldnames


def _to_csv_value(value):
    """Return *value* as UTF-8 bytes if it is unicode text, else unchanged.

    Python 2's csv writer cannot handle unicode objects containing non-ASCII
    characters, so every text field has to be encoded before it is written.
    """
    if isinstance(value, unicode):
        return value.encode('utf-8')
    return value


# Export one CSV row per CTEC, merged with the course that shares its _id.
# The Python 2 csv module requires the file to be opened in binary mode.
with open("ctecs.csv", "wb") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for ctec in ctecs.find():
        # NOTE(review): assumes every CTEC has a course with the same _id;
        # dict() raises TypeError if the lookup yields None -- confirm.
        course = courses.find_one({"_id": ctec["_id"]})
        course_ctec = dict(course)
        course_ctec.update(ctec)  # CTEC values win on key clashes
        course_ctec['id'] = course_ctec.pop('_id')

        # 'term' is split on whitespace: first token -> year, second -> quarter.
        term_parts = course_ctec['term'].split()
        course_ctec['year'] = term_parts[0]
        course_ctec['quarter'] = term_parts[1]

        # Keep only the exported columns and encode every unicode value (not
        # just 'essay') so non-ASCII titles or instructor names do not raise
        # UnicodeEncodeError. Missing columns are left to DictWriter's restval.
        writer.writerow({
            name: _to_csv_value(course_ctec[name])
            for name in fieldnames
            if name in course_ctec
        })
        # print course_ctec['id'], course_ctec['term'], course_ctec['catalog_num'], course_ctec['instructor']