def read_and_stock_dict_course_items_from_webroot_source(self, course_items_source_htmlwebroot_filename=None):
    '''
    1st read option: course items are extracted from the HTML webroot source file.

    The file text is scanned with ``self.re_compiled_text_to_find``; group 1 of
    each match is used as the course id and group 2 as its sequence number.
    Every course id not yet present in ``self.unique_course_id_dict`` is looked
    up in the database by ``cid`` (and created and saved when it does not
    exist), then stored in that dict under its course id.

    :param course_items_source_htmlwebroot_filename: file name to read; it is
        first passed through
        ``self.return_course_items_source_htmlwebroot_filename_or_default_or_raise``
        and the value that helper returns is the one actually opened.
    :return: None; results are accumulated in ``self.unique_course_id_dict``.
    '''
    course_items_source_htmlwebroot_filename = self.return_course_items_source_htmlwebroot_filename_or_default_or_raise(course_items_source_htmlwebroot_filename)
    # Read inside a context manager so the handle is closed deterministically.
    with open(course_items_source_htmlwebroot_filename) as source_file:
        text = source_file.read()
    for each_re_found in self.re_compiled_text_to_find.finditer(text):
        course_id = each_re_found.group(1)
        try:
            # A non-numeric sequence number disqualifies the match: skip it
            # without adding anything to the dict.
            course_n_seq = int(each_re_found.group(2))
        except ValueError:
            continue
        if course_id in self.unique_course_id_dict:
            # Only the first occurrence of each course id is kept.
            continue
        try:
            coursera_item_obj = CourseraCourse.objects.get(cid=course_id)
        except CourseraCourse.DoesNotExist:
            coursera_item_obj = CourseraCourse()
            coursera_item_obj.cid = course_id
            # Keep the parsed integer rather than the raw regex text, so a newly
            # created object holds n_seq as an int like the rest of this file
            # (which compares it against 0 / -1 and assigns int(n_seq)).
            coursera_item_obj.n_seq = course_n_seq
            coursera_item_obj.save()
        self.unique_course_id_dict[course_id] = coursera_item_obj
def _has_marker(element, marker):
    '''Return True when *element* carries *marker* as an attribute name or as one of its CSS class tokens.'''
    if element.get(marker) is not None:
        return True
    return marker in (element.get('class') or '').split()


def scrape(html_text):
    '''
    Walk the course-listing markup in *html_text* and build a CourseraCourse
    for every listing found.

    The text is parsed with ``ET.fromstring``, so it must be well-formed
    markup.  A listing is a ``div`` that has a ``data-course-id`` attribute;
    inside it the code descends through the ``coursera-course-listing-text``
    and ``coursera-course-listing-main`` divs, reads title and href from the
    ``h3/a`` link, and then inspects the ``coursera-course-listing-progress``
    (start date) and ``coursera-course-listing-statement`` (university) divs.

    The previous version called ``.iter()`` / ``.find()`` on the *string*
    returned by ``Element.get(...)``, which raised AttributeError (or hit
    ``str.find``) as soon as a listing was encountered; traversal now always
    continues from the element itself.  Marker names are matched either as an
    attribute name (the original test) or as a CSS class token.
    NOTE(review): class-token matching is an assumption about the page markup
    -- confirm against a real listing page.

    :param html_text: markup to parse.
    :return: None.
    :raises IndexError, ValueError: when an href does not have the expected
        ``.../<cid>-<n_seq>`` shape in its 4th '/'-separated segment.
    '''
    root = ET.fromstring(html_text)
    for course_div in root.iter('div'):
        if course_div.get('data-course-id') is None:
            continue
        for text_div in course_div.iter('div'):
            if not _has_marker(text_div, 'coursera-course-listing-text'):
                continue
            for main_div in text_div.iter('div'):
                if not _has_marker(main_div, 'coursera-course-listing-main'):
                    continue
                h3_tag = main_div.find('h3')
                a_tag = h3_tag.find('a') if h3_tag is not None else None
                href = a_tag.get('href') if a_tag is not None else None
                if href is None:
                    # Without the course link there is no cid / n_seq to read.
                    continue
                # The 4th '/'-separated segment of the href is '<cid>-<n_seq>';
                # the cid itself may contain dashes, so split on the last one.
                cid_n_seq = href.split('/')[3]
                cid, _, n_seq = cid_n_seq.rpartition('-')
                course = CourseraCourse()
                course.title = a_tag.text
                course.cid = cid
                course.n_seq = int(n_seq)
                for inner_div in main_div.iter('div'):
                    if _has_marker(inner_div, 'coursera-course-listing-progress'):
                        start_date_span = inner_div.find('span')
                        if start_date_span is not None and start_date_span.text:
                            # First word is a 3-letter English month, last word
                            # is the day followed by a 2-character suffix.
                            date_parts = start_date_span.text.split(' ')
                            month = timeutils.array_3letter_months_english.index(date_parts[0]) + 1
                            day_str = date_parts[-1][:-2]
                            # NOTE(review): month / day_str are parsed but not
                            # stored anywhere yet (no year is available here).
                    if _has_marker(inner_div, 'coursera-course-listing-statement'):
                        outer_div_for_university = inner_div.find('div')
                        university_a_tag = None
                        if outer_div_for_university is not None:
                            university_a_tag = outer_div_for_university.find('a')
                        if university_a_tag is not None and university_a_tag.get('class') is not None:
                            # A class attribute on the link confirms we are in
                            # the right <div />.
                            institution = Institution.objects.get(name=university_a_tag.text)
                            # NOTE(review): the course is never saved or returned
                            # by this function; if `institutions` is a Django
                            # many-to-many field, add() on an unsaved course is
                            # expected to fail -- confirm and decide where the
                            # save belongs.
                            course.institutions.add(institution)
def get_course_or_create_it_or_None(self, cid, n_seq):
    '''
    Fetch the CourseraCourse whose ``cid`` matches, creating it when absent.

    :param cid: course id used for the database lookup.
    :param n_seq: sequence number; 0 short-circuits to None.
    :return: the newly created and saved course when none existed; the
        existing course when its ``n_seq`` equals *n_seq* (an existing
        ``n_seq`` of -1 is first overwritten in memory with *n_seq*, without
        saving); None when *n_seq* is 0 or the existing course has a
        different ``n_seq``.
    '''
    if n_seq == 0:
        return None

    try:
        existing = CourseraCourse.objects.get(cid=cid)
    except CourseraCourse.DoesNotExist:
        # Nothing stored under this cid yet: create, persist and hand it back.
        created = CourseraCourse()
        created.cid = cid
        created.n_seq = n_seq
        created.save()
        return created

    # -1 is replaced by the requested sequence number (in memory only).
    if existing.n_seq == -1:
        existing.n_seq = n_seq

    return None if existing.n_seq != n_seq else existing
def save_courses_subset_to_db(self):
    '''
    Save every item of ``self.courses_subset`` to the database as a new
    CourseraCourse.

    For each item a progress line is printed, a CourseraCourse is filled from
    the item's ``cid``, ``get_n_seq()``, ``title`` and (when not None)
    ``start_date`` and ``duration_in_weeks``, and the course is saved.  When
    the item has a ``university``, an Institution with that name is looked up
    and, if missing, created and saved.

    NOTE(review): the institution is resolved/created but never attached to
    the course; an earlier ``course.institutions.add(institution.id)`` call
    was left commented out by the author.  Decide whether the link should be
    made after ``course.save()``.

    :return: None.
    '''
    for position, course_subset in enumerate(self.courses_subset, 1):
        label = str(position).zfill(3)
        try:
            # Format the whole line first so nothing is written when the item
            # cannot be rendered; the fallback below then prints a clean line.
            print('%s Saving to db %s' % (label, course_subset))
        except UnicodeEncodeError:
            print('%s Saving to db' % label)

        course = CourseraCourse()
        course.cid = course_subset.cid
        course.n_seq = course_subset.get_n_seq()
        course.title = course_subset.title
        if course_subset.start_date is not None:
            course.start_date = course_subset.start_date
        if course_subset.duration_in_weeks is not None:
            course.duration_in_weeks = course_subset.duration_in_weeks

        university_name = course_subset.university
        if university_name is not None:
            try:
                institution = Institution.objects.get(name=university_name)
            except Institution.DoesNotExist:
                institution = Institution()
                institution.name = university_name
                institution.save()
                print('institution id %s' % (institution.id,))
            except AttributeError as e:
                # Best effort: report the problem and still save the course.
                print('university_name %s' % (university_name,))
                print(e)

        course.save()
def make_test_course():
    '''
    Build a sample CourseraCourse ('introstats2') with one institution, one
    instructor and one category, print it and its instructors, and return it.

    Nothing is saved by this function itself; ids are assigned by hand.

    :return: the populated CourseraCourse instance.
    '''
    course = CourseraCourse()
    course.cid = 'introstats2'
    course.n_seq = 1  # '001'
    course.title = 'Introduction to Statistics'
    course.description = 'Introduction to Statistics is nice course!'
    course.start_date = datetime.date(2013, 4, 5)
    course.duration_in_weeks = 8
    # course.workload_in_hours_per_day = 3
    course.workload_in_hours_per_week = 3

    institution = Institution()
    institution.id = 10
    institution.name = 'Harvard Univ.'
    course.institutions = [institution]

    professor = Instructor()
    professor.id = 10
    professor.name = 'John Joey'
    professor.institution = institution
    course.instructors = [professor]

    category = Category()
    category.id = 10
    category.name = 'Mathematics & Statistics'
    course.categories = [category]

    # Single-argument print calls produce the same output on Python 2 as the
    # former print statements and also parse on Python 3.
    print('course %s' % (course,))
    print('Instructors %s' % (course.instructors.values(),))
    return course