def fill_in_cid_and_n_seq_dict_from_courseras_webrootpage(self):
    """
    Fill self.cid_and_n_seq_dict_from_courseras_webrootpage from a text file.

    The url-stock comes directly from a text file whose path is given by
    ls.get_default_textfile_with_extracted_ids_and_nseqs_from_coursera_webrootpage_abspath().
    This text file, in turn, is produced by CourseraWebRootPageScraperMod.py,
    ie, its entries are the courses listed in Coursera's Web Root Course
    Index Page.

    Each line is read as "course_id,n_seq". A line is skipped when:
      * it starts with "#";
      * its course_id is in WorkCourse.finished_course_ids_list
        (a notice is printed in this case);
      * it has no second comma-separated field, or that field is not an integer;
      * its n_seq is 0.
    Every remaining pair is stored as dict[course_id] = n_seq, and the
    dict's size is printed at the end.
    """
    tuplestextfile = ls.get_default_textfile_with_extracted_ids_and_nseqs_from_coursera_webrootpage_abspath()
    # 'with' closes the file handle even if reading raises
    with open(tuplestextfile) as textfile:
        lines = textfile.read().split("\n")
    for line in lines:
        if line.startswith("#"):
            continue
        fields = line.split(",")
        course_id = fields[0]  # str.split always yields at least one element
        if course_id in WorkCourse.finished_course_ids_list:
            # single-argument print(...) outputs the same text under Python 2 and 3
            print("Course_id %s is finished. Continuing next." % (course_id,))
            continue
        try:
            n_seq = int(fields[1])
        except (IndexError, ValueError):
            # no second field, or it is not an integer: ignore this line
            continue
        if n_seq == 0:
            continue
        self.cid_and_n_seq_dict_from_courseras_webrootpage[course_id] = n_seq
    print("Total courses found at coursera's webrootpage: %d"
          % len(self.cid_and_n_seq_dict_from_courseras_webrootpage))
def write_to_txtfile_current_stocked_coursera_items(self, txt_filename=None):
    '''
    Write stocked course items to a txt file source, one "cid,n_seq" line each.

    Nothing is written (and None is returned) when self.course_tuple_list
    is empty.

    txt_filename: path of the file to (over)write. When None or empty, the
      path returned by
      ls.get_default_textfile_with_extracted_ids_and_nseqs_from_coursera_webrootpage_abspath()
      is used.
    '''
    n_items = len(self.course_tuple_list)
    if n_items == 0:
        return
    # Honour the caller-supplied target; fall back to the default location otherwise.
    if txt_filename:
        target_abspath = txt_filename
    else:
        target_abspath = ls.get_default_textfile_with_extracted_ids_and_nseqs_from_coursera_webrootpage_abspath()
    # single-argument print(...) outputs the same text under Python 2 and 3
    print('Writing %d lines to %s' % (n_items, target_abspath))
    # 'with' closes the file even if a write fails midway
    with open(target_abspath, 'w') as fileobj:
        for tuple_item in self.course_tuple_list:
            # the 1st tuple element is course_id; the 2nd is an object
            # exposing the attributes cid and n_seq
            course_item_obj = tuple_item[1]
            fileobj.write('%s,%s\n' % (course_item_obj.cid, course_item_obj.n_seq))
def process():
    """
    Scrape the courses from the webroot source and dump them to the
    default ids-and-nseqs text file, announcing each step on stdout.
    """
    # single-argument print(...) outputs the same text under Python 2 and 3
    print('Extracting courses from webroot, please wait.')
    webroot_extractor = CourseraWebRootCourseExtractor()
    webroot_extractor.restart_items_by_reading_htmlwebroot_source()
    destination_abspath = ls.get_default_textfile_with_extracted_ids_and_nseqs_from_coursera_webrootpage_abspath()
    print('Writing found courses to %s' % (destination_abspath,))
    webroot_extractor.write_to_txtfile_current_stocked_coursera_items()