def extract_units(url, headers, file_formats):
    """Parse a webpage and extract its resources, e.g. video_url, sub_url, etc."""
    #logging.info("Processing '%s'", url)
    html = get_page_contents(url, headers)
    extractor = get_page_extractor(url)
    return extractor.extract_units_from_html(html, BASE_URL, file_formats)
def get_available_sections(url, headers):
    """Extract the sections and subsections from a given url.

    :param url: course page url to parse
    :param headers: HTTP headers passed through to get_page_contents
    :returns: whatever the page extractor's extract_sections_from_html returns
    """
    # Lazy %-style logging args: the message is only built when DEBUG is
    # enabled (the original concatenated strings eagerly on every call).
    logging.debug("Extracting sections for :%s", url)
    page = get_page_contents(url, headers)
    page_extractor = get_page_extractor(url)
    sections = page_extractor.extract_sections_from_html(page, BASE_URL)
    logging.debug("Extracted sections: %s", sections)
    return sections
def get_courses_info(url, headers):
    """Extract the courses information from the dashboard."""
    logging.info('Extracting course information from dashboard.')
    dashboard_html = get_page_contents(url, headers)
    extractor = get_page_extractor(url)
    course_list = extractor.extract_courses_from_html(dashboard_html, BASE_URL)
    logging.debug('Data extracted: %s', course_list)
    return course_list
def extract_video_component(args, coursename, headers, soup, section, subsection, unit):
    """Collect video metadata (YouTube url, duration, transcripts) for every
    video block found in *soup*.

    NOTE(review): a later definition of extract_video_component in this file
    shadows this one at import time.

    :param args: CLI args; only args.html_dir is used (error-report path)
    :param coursename: course directory name under args.html_dir
    :param headers: HTTP headers for edx transcript downloads
    :param soup: BeautifulSoup fragment of one unit
    :param section/subsection/unit: labels recorded in each metadata dict
    :returns: list of per-video metadata dicts
    """
    video_meta_list = []
    for video_comp in soup.findAll("div", {"data-block-type": "video"}):
        video_meta = dict()
        txtjson = video_comp.find('div', {"data-metadata": True})['data-metadata']
        txt2dict = json.loads(txtjson)
        # Fix: escape the dot — the stream entry literally starts with "1.00:";
        # the original r"1.00:" pattern let "." match any character.
        yt_id = re.sub(r"1\.00:", '', txt2dict['streams'])
        yt_link = 'https://youtu.be/' + yt_id
        duration = videolen(yt_link)
        video_meta.update({'section': section,
                           'subsection': subsection,
                           'unit_idx': unit,
                           'youtube_url': yt_link,
                           'video_duration': duration})
        for key, value in txt2dict['transcriptLanguages'].items():
            transcript_name = 'transcript_' + key
            transcript_url = 'https://courses.edx.org/' + re.sub(
                r"__lang__", key, txt2dict['transcriptTranslationUrl'])
            print('download ' + value + ' transcript of ' + yt_link)
            try:
                transcript_dump = get_page_contents(transcript_url, headers)
                transcript_raw = json.loads(transcript_dump)
                #print (transcript_raw)
                video_meta.update({transcript_name: transcript_raw['text']})
            except (HTTPError, URLError) as exception:
                # edx download failed -> fall back to the YouTube transcript.
                print(' bug: cannot download from edx site')
                transcript_dump = YT_transcript(yt_link, key)
                if len(transcript_dump) == 0:
                    print(' no transcript available on YouTube')
                    video_meta.update({transcript_name: {"start": '', "end": '', "text": ''}})
                    # Fix: logging.warn() is deprecated; use logging.warning().
                    logging.warning('transcript (error: %s)', exception)
                    errorlog = os.path.join(args.html_dir, coursename,
                                            'transcript_error_report.txt')
                    text = '---------------------------------\n'\
                        + 'transcript error: ' + str(exception) +'\n' \
                        + 'video url: '+ yt_link +'\n' \
                        + 'language: ' + value + '\n' \
                        + 'section: ' + section + '\n'\
                        + 'subsection: ' + subsection + '\n'\
                        + 'unit_idx: ' + unit + '\n' \
                        +'---------------------------------'
                    # Fix: context manager instead of open()/close() pair,
                    # so the handle is released even if write() raises.
                    with open(errorlog, 'a') as f:
                        f.write(text)
                else:
                    # Fix: typo "successfuly" in the user-facing message.
                    print(' transcript was successfully downloaded from YouTube')
                    video_meta.update({transcript_name: transcript_dump['text']})
        video_meta_list.append(video_meta)
    return video_meta_list
def save_html_to_file(args, selections, all_urls, headers):
    """Download every selected subsection page, split it into units, and save
    each unit as HTML plus extracted text / problem / video side files.

    NOTE(review): this function is redefined later in this file; the later
    definition shadows this one at import time.

    :param args: CLI args; args.html_dir is the output root
    :param selections: mapping of selected_course -> list of selected sections
    :param all_urls: flat list of subsection urls, indexed by sub_idx in the
        same order the subsections are iterated here
    :param headers: HTTP headers for page downloads
    """
    sub_idx = 0           # global subsection counter, indexes into all_urls
    prob_type_set = []    # accumulates one line per problem type seen
    counter_video = 1     # global id used for "video_block_NN" json keys
    for selected_course, selected_sections in selections.items():
        coursename = directory_name(selected_course.name)
        for selected_section in selected_sections:
            section_dirname = "%02d-%s" % (selected_section.position,
                                           selected_section.name)
            target_dir = os.path.join(args.html_dir, coursename,
                                      clean_filename(section_dirname))
            mkdir_p(target_dir)
            for subsection in selected_section.subsections:
                # NOTE(review): PEP 8 prefers "is None" here.
                if subsection.name == None:
                    subsection.name = 'Untitled'
                target_subdir = os.path.join(
                    target_dir,
                    str(sub_idx).zfill(3) + '-' + clean_filename(subsection.name))
                mkdir_p(target_subdir)
                logging.info('url: ' + str(all_urls[sub_idx]) +
                             ', subsection: ' + str(sub_idx).zfill(3) + '-' +
                             str(subsection.name))
                page = get_page_contents(str(all_urls[sub_idx]), headers)
                soup = BeautifulSoup(page, "html.parser")
                # div contains all units (seq_contents_#)
                main_content = soup.find("div", {"class": "container"})
                units = crawl_units(main_content)
                counter = 0          # per-subsection unit index
                sub_idx = sub_idx + 1
                for unit in units:
                    # One set of output filenames per unit.
                    filename_template = "seq_contents_" + str(counter) + ".html"
                    filename = os.path.join(target_subdir, filename_template)
                    filename_template_txt = "seq_contents_" + str(counter) + ".txt"
                    filename_txt = os.path.join(target_subdir,
                                                filename_template_txt)
                    filename_template_prob_txt = "seq_contents_" + str(counter) + "_prob.txt"
                    filename_prob_txt = os.path.join(
                        target_subdir, filename_template_prob_txt)
                    filename_template_video_json = "seq_contents_" + str(counter) + "_vdo.json"
                    filename_video_json = os.path.join(
                        target_subdir, filename_template_video_json)
                    logging.info('path: ' + str(target_subdir) +
                                 ', filename: ' + str(filename))
                    try:
                        # "-" means write to stdout instead of a file.
                        file_ = sys.stdout if filename == '-' else codecs.open(
                            filename, 'w', 'utf-8')
                    except IOError as exc:
                        # Log the failure, then retry with the bare template
                        # name. NOTE(review): that fallback writes into the
                        # current working directory, not target_subdir —
                        # presumably a workaround for invalid paths; confirm.
                        f = open('downloading_error_report.txt', 'a')
                        text = 'External command error ignored: ' + str(
                            exc) + '\n\n'
                        f.write(text)
                        f.close()
                        file_ = sys.stdout if filename == '-' else codecs.open(
                            filename_template, 'w', 'utf-8')
                    file_.writelines(unit.prettify(formatter=None))
                    file_.close()
                    # Re-parse the unit alone so component searches below only
                    # see this unit's markup.
                    soup = unit.prettify(formatter=None)
                    soup = BeautifulSoup(soup, "html.parser")
                    # select only html component (disregard video, problem)
                    html_flag = soup.findAll("div", {"data-block-type": "html"})
                    if len(html_flag) > 0:
                        # create file only when html component exists
                        file_txt = sys.stdout if filename_txt == '-' else codecs.open(
                            filename_txt, 'w', 'utf-8')
                        text = ""
                        for soup_component in html_flag:
                            # Flatten headings, paragraphs and list items into
                            # one space-separated text blob.
                            for s in soup_component.findAll([
                                    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li'
                            ]):
                                text += s.getText() + " "
                        file_txt.writelines(text)
                        file_txt.close()
                        print(filename_txt + ' of text component was created')
                    # select only problem component (disregard video, text)
                    prob_txt, prob_types = extract_problem_comp(soup)
                    if len(prob_txt) > 0:
                        file_prob_txt = sys.stdout if filename == '-' else codecs.open(
                            filename_prob_txt, 'w', 'utf-8')
                        for prob_type in prob_types:
                            prob_type_set.append(prob_type + ' \n')
                        file_prob_txt.writelines(prob_txt)
                        file_prob_txt.close()
                        print(filename_prob_txt + ' of problem component was created')
                    tmp_video_dict = extract_video_component(
                        args, coursename, headers, soup,
                        clean_filename(section_dirname),
                        clean_filename(subsection.name),
                        "seq_contents_" + str(counter))
                    if len(tmp_video_dict) > 0:
                        file_video_json = sys.stdout if filename == '-' else codecs.open(
                            filename_video_json, 'w', 'utf-8')
                        video_unit_dict = dict()
                        for vd in tmp_video_dict:
                            # counter_video is global across the whole run, so
                            # block keys are unique across units.
                            video_unit_dict.update({
                                "video_block_" + str(counter_video).zfill(2): vd
                            })
                            counter_video += 1
                        video_dict2json = json.dumps(video_unit_dict,
                                                     sort_keys=False,
                                                     indent=4,
                                                     separators=(',', ': '))
                        file_video_json.writelines(video_dict2json)
                        file_video_json.close()
                        print(filename_video_json + ' of video component was created')
                    counter += 1
        # Dump the accumulated problem-type lines for this course.
        save_urls_to_file(
            prob_type_set,
            os.path.join(args.html_dir, coursename, "all_prob_type.txt"))
def save_html_to_file(args, selections, all_urls, headers):
    """Download every selected subsection page, save each unit's raw HTML under
    source_html_file/, and aggregate text / quiz / video / component metadata
    into per-course JSON files, a CSV metadata table, and a tarball.

    NOTE(review): this definition shadows the earlier save_html_to_file in
    this file.

    :param args: CLI args; args.html_dir is the output root
    :param selections: mapping of selected_course -> list of selected sections
    :param all_urls: flat list of subsection urls, indexed by sub_idx in the
        same order the subsections are iterated here
    :param headers: HTTP headers for page downloads
    """
    sub_idx = 0            # global subsection counter, indexes into all_urls
    prob_type_set = []     # one line per problem type seen
    counter_video = 1      # id for "video_block_NNNN" json keys
    counter_unit = 1       # id for the NNNN.html source filenames
    txt_id = 1             # id for "text_block_NNNN" json keys
    prob_id = 1            # id for "quiz_block_NNNN" json keys
    video_id = 1           # incremented per unit with videos (not used in keys)
    comp_id = 1            # id for "NNNN_<type>" component json keys
    tmp_course_strut = dict()   # current section/subsection/unit labels
    txt_dict_ls = dict()        # all text blocks for the course
    prob_dict_ls = dict()       # all quiz blocks for the course
    comp_dict_ls = dict()       # ordered component-type records
    video_dict_ls = dict()      # all video blocks for the course
    for selected_course, selected_sections in selections.items():
        coursename = directory_name(selected_course.name)
        sourcepath = os.path.join(args.html_dir, coursename, 'source_html_file')
        mkdir_p(sourcepath)
        #filename_meta = os.path.join(sourcepath, 'html_metadata.csv')
        # Parallel lists: section, subsection, unit, html filename per unit.
        metasec_ls = [[], [], [], []]
        for selected_section in selected_sections:
            section_dirname = "%02d-%s" % (selected_section.position,
                                           selected_section.name)
            tmp_course_strut['section'] = (section_dirname)
            for subsection in selected_section.subsections:
                # NOTE(review): PEP 8 prefers "is None" here.
                if subsection.name == None:
                    subsection.name = 'Untitled'
                tmp_course_strut['subsection'] = (subsection.name)
                #logging.info('url: '+ str(all_urls[sub_idx]) )
                print(all_urls[sub_idx])
                page = get_page_contents(str(all_urls[sub_idx]), headers)
                soup = BeautifulSoup(page, "html.parser")
                # div contains all units (seq_contents_#)
                main_content = soup.find("div", {"class": "container"})
                units = crawl_units(main_content)
                sub_idx = sub_idx + 1
                for idx, unit in enumerate(units):
                    # Source html is numbered globally: 0001.html, 0002.html, ...
                    filename_template = str(counter_unit).zfill(4) + ".html"
                    filename = os.path.join(args.html_dir, coursename,
                                            'source_html_file',
                                            filename_template)
                    try:
                        # "-" means write to stdout instead of a file.
                        file_ = sys.stdout if filename == '-' else codecs.open(
                            filename, 'w', 'utf-8')
                    except IOError as exc:
                        # Log the failure, then retry with the bare template
                        # name. NOTE(review): the fallback writes into the
                        # current working directory, not sourcepath — confirm
                        # this is intentional.
                        f = open('downloading_error_report.txt', 'a')
                        text = 'External command error ignored: ' + str(
                            exc) + '\n\n'
                        f.write(text)
                        f.close()
                        file_ = sys.stdout if filename == '-' else codecs.open(
                            filename_template, 'w', 'utf-8')
                    file_.writelines(unit.prettify(formatter=None))
                    file_.close()
                    # Re-parse the unit alone so the searches below only see
                    # this unit's markup.
                    soup = unit.prettify(formatter=None)
                    soup = BeautifulSoup(soup, "html.parser")
                    cur_unit = soup.find("h2", {
                        "class": "hd hd-2 unit-title"
                    }).getText()
                    # NOTE(review): getText() returns a string, so this None
                    # check can never fire; a missing <h2> would already have
                    # raised AttributeError on the line above.
                    if cur_unit == None:
                        cur_unit = 'Untitled'
                    tmp_course_strut['unit'] = (cur_unit)
                    logging.info('section: ' + tmp_course_strut['section'])
                    logging.info(' subsection: ' +
                                 tmp_course_strut['subsection'])
                    logging.info(' unit: ' + tmp_course_strut['unit'])
                    metasec_ls[0].append(tmp_course_strut['section'])
                    metasec_ls[1].append(tmp_course_strut['subsection'])
                    metasec_ls[2].append(tmp_course_strut['unit'])
                    metasec_ls[3].append(filename_template)
                    # select only html component (disregard video, problem)
                    html_flag = soup.findAll("div", {"data-block-type": "html"})
                    if len(html_flag) > 0:
                        # record text only when an html component exists
                        text = ""
                        for soup_component in html_flag:
                            # Flatten headings, paragraphs and list items into
                            # one space-separated blob.
                            for s in soup_component.findAll([
                                    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li'
                            ]):
                                text += s.getText() + " "
                        tmp_dict = {
                            'text_block_' + str(txt_id).zfill(4): {
                                'section': tmp_course_strut['section'],
                                'subsection': tmp_course_strut['subsection'],
                                'unit': tmp_course_strut['unit'],
                                'content': text
                            }
                        }
                        txt_dict_ls.update(tmp_dict)
                        txt_id += 1
                    # select only problem component (disregard video, text)
                    prob_txt, prob_types = extract_problem_comp(soup)
                    if len(prob_txt) > 0:
                        for prob_type in prob_types:
                            prob_type_set.append(prob_type + ' \n')
                        tmp_dict = {
                            'quiz_block_' + str(prob_id).zfill(4): {
                                'section': tmp_course_strut['section'],
                                'subsection': tmp_course_strut['subsection'],
                                'unit': tmp_course_strut['unit'],
                                'content': prob_txt
                            }
                        }
                        prob_dict_ls.update(tmp_dict)
                        #print(tmp_dict)
                        prob_id += 1
                    tmp_video_dict = extract_video_component(
                        args, coursename, headers, soup,
                        tmp_course_strut['section'],
                        tmp_course_strut['subsection'],
                        tmp_course_strut['unit'])
                    if len(tmp_video_dict) > 0:
                        video_unit_dict = dict()
                        for vd in tmp_video_dict:
                            # counter_video is global across the run, so the
                            # block keys stay unique across units.
                            video_unit_dict.update({
                                "video_block_" + str(counter_video).zfill(4): vd
                            })
                            counter_video += 1
                        video_dict_ls.update(video_unit_dict)
                        video_id += 1
                        print(video_dict_ls)
                    counter_unit += 1
                    # Record the ordered sequence of component types for this
                    # unit (html / video / problem only).
                    set_comp_types = soup.findAll("div",
                                                  {"data-block-type": True})
                    for comp_type in set_comp_types:
                        if comp_type['data-block-type'] in [
                                'html', 'video', 'problem'
                        ]:
                            comp_dict = {
                                str(comp_id).zfill(4) + '_' +
                                comp_type['data-block-type']: {
                                    'section': tmp_course_strut['section'],
                                    'subsection': tmp_course_strut['subsection'],
                                    'unit': tmp_course_strut['unit'],
                                    'type': comp_type['data-block-type']
                                }
                            }
                            comp_dict_ls.update(comp_dict)
                            comp_id += 1
        # Serialize the per-course aggregates.
        txt_dict2json = json.dumps(txt_dict_ls,
                                   sort_keys=True,
                                   indent=4,
                                   separators=(',', ': '))
        prob_dict2json = json.dumps(prob_dict_ls,
                                    sort_keys=True,
                                    indent=4,
                                    separators=(',', ': '))
        video_dict2json = json.dumps(video_dict_ls,
                                     sort_keys=True,
                                     indent=4,
                                     separators=(',', ': '))
        comp_dict2json = json.dumps(comp_dict_ls,
                                    sort_keys=True,
                                    indent=4,
                                    separators=(',', ': '))
        with open(os.path.join(args.html_dir, coursename, 'all_textcomp.json'),
                  'w', encoding='utf-8') as f:
            f.write(txt_dict2json)
        with open(os.path.join(args.html_dir, coursename, 'all_probcomp.json'),
                  'w', encoding='utf-8') as f:
            f.write(prob_dict2json)
        with open(os.path.join(args.html_dir, coursename, 'all_videocomp.json'),
                  'w', encoding='utf-8') as f:
            f.write(video_dict2json)
        with open(os.path.join(args.html_dir, coursename, 'all_comp.json'),
                  'w', encoding='utf-8') as f:
            f.write(comp_dict2json)
        # CSV table mapping each saved html file to its course position.
        metafile_dict = {
            'section': metasec_ls[0],
            'subsection': metasec_ls[1],
            'unit': metasec_ls[2],
            'htmlfile': metasec_ls[3]
        }
        df = pd.DataFrame.from_dict(metafile_dict)
        df.to_csv(
            os.path.join(args.html_dir, coursename, 'source_html_file',
                         'metadata.csv'))
        save_urls_to_file(
            prob_type_set,
            os.path.join(args.html_dir, coursename, "all_prob_type.txt"))
        # Bundle the raw html sources for this course.
        make_tarfile(
            os.path.join(args.html_dir, coursename, 'sourcefile.tar.gz'),
            os.path.join(args.html_dir, coursename, 'source_html_file'))
def extract_video_component(args, coursename, headers, soup, section,
                            subsection, unit):
    """Collect metadata for every video block in *soup*: source url (YouTube
    or edx-hosted file), duration, edx video id, start time, and transcripts
    (with speech periods) per available language.

    NOTE(review): this definition shadows the earlier extract_video_component
    in this file.

    :param args: CLI args; only args.html_dir is used (error-report path)
    :param coursename: course directory name under args.html_dir
    :param headers: HTTP headers for edx downloads
    :param soup: BeautifulSoup fragment of one unit
    :param section/subsection/unit: labels recorded in each metadata dict
    :returns: list of per-video metadata dicts
    """
    video_flag = soup.findAll("div", {"data-block-type": "video"})
    video_meta_list = []
    for video_comp in video_flag:
        video_meta = dict()
        video = video_comp.find('div', {"data-metadata": True})
        txtjson = video['data-metadata']
        edx_video_id = video['id']
        txt2dict = json.loads(txtjson)
        start_time = txt2dict['start']
        # Strip the leading speed prefix from the stream spec to get the
        # YouTube id. NOTE(review): the dot is unescaped, so "." matches any
        # character; the key is literally "1.00:" — should be r"1\.00:".
        yt_id = re.sub(r"1.00:", '', txt2dict['streams'])
        if len(txt2dict['streams']) == 0:
            # No YouTube stream: the video is hosted as a plain file on edx.
            duration = txt2dict['duration']
            yt_link = 'n/a'
            video_source = [i for i in txt2dict['sources']]
            if duration == 0:
                # Metadata had no duration; probe the file itself.
                try:
                    duration = extract_duration_from_non_YT_video(
                        video_source[0], headers)
                except (HTTPError, URLError) as exception:
                    print(' bug: cannot download video from edx site')
                    duration = 'n/a'
            video_meta.update({
                'section': section,
                'subsection': subsection,
                'unit': unit,
                'youtube_url': yt_link,
                'video_source': video_source[0],
                'video_duration': duration,
                'video_id': edx_video_id,
                'start': start_time
            })
        else:
            # YouTube-hosted video.
            yt_link = 'https://youtu.be/' + yt_id
            duration = videolen(yt_link)
            video_source = 'n/a'
            if duration == 0:
                # YouTube probe failed; fall back to the metadata value.
                duration = txt2dict['duration']
            video_meta.update({
                'section': section,
                'subsection': subsection,
                'unit': unit,
                'youtube_url': yt_link,
                'video_source': video_source,
                'video_duration': duration,
                'video_id': edx_video_id,
                'start': start_time
            })
        for key, value in txt2dict['transcriptLanguages'].items():
            transcript_name = 'transcript_' + key
            transcript_url = BASE_URL + '/' + re.sub(
                r"__lang__", key, txt2dict['transcriptTranslationUrl'])
            if yt_link == 'n/a':
                print('download ' + value + ' transcript of ' +
                      video_source[0])
            else:
                print('download ' + value + ' transcript of ' + yt_link)
            try:
                # Preferred path: fetch the transcript json from edx.
                transcript_dump = get_page_contents(transcript_url, headers)
                transcript_raw = json.loads(transcript_dump)
                #print (transcript_raw)
                speech_period = extract_speech_period(transcript_raw['start'],
                                                      transcript_raw['end'])
                speech_times = extract_speech_times(transcript_raw['start'],
                                                    transcript_raw['end'])
                video_meta.update({
                    transcript_name: transcript_raw['text'],
                    'speech_period': speech_period,
                    'speech_times': speech_times
                })
            except (HTTPError, URLError) as exception:
                print(' bug: cannot download transcript from edx site')
                if yt_link == 'n/a':
                    # Non-YouTube video: no fallback source; record an empty
                    # transcript and log the failure.
                    video_meta.update({
                        transcript_name: {
                            "start": '',
                            "end": '',
                            "text": ''
                        },
                        'speech_period': 'n/a'
                    })
                    logging.warning('transcript (error: %s)', exception)
                    errorlog = os.path.join(args.html_dir, coursename,
                                            'transcript_error_report.txt')
                    f = open(errorlog, 'a')
                    text = '---------------------------------\n'\
                        + 'transcript error: ' + str(exception) +'\n' \
                        + 'video file: '+ video_source[0] +'\n' \
                        + 'language: ' + value + '\n' \
                        + 'section: ' + section + '\n'\
                        + 'subsection: ' + subsection + '\n'\
                        + 'unit_idx: ' + unit + '\n' \
                        +'---------------------------------'
                    f.write(text)
                    f.close()
                    # Skip the YouTube fallback for non-YouTube videos.
                    continue
                print(' attempt to download transcript on Youtube')
                transcript_raw = YT_transcript(yt_link, key)
                if len(transcript_raw) == 0:
                    # YouTube fallback failed too: record an empty transcript
                    # and log the original edx exception.
                    print(' no transcript available on YouTube')
                    video_meta.update({
                        transcript_name: {
                            "start": '',
                            "end": '',
                            "text": ''
                        },
                        'speech_period': 'n/a'
                    })
                    logging.warning('transcript (error: %s)', exception)
                    errorlog = os.path.join(args.html_dir, coursename,
                                            'transcript_error_report.txt')
                    f = open(errorlog, 'a')
                    text = '---------------------------------\n'\
                        + 'transcript error: ' + str(exception) +'\n' \
                        + 'video url: '+ yt_link +'\n' \
                        + 'language: ' + value + '\n' \
                        + 'section: ' + section + '\n'\
                        + 'subsection: ' + subsection + '\n'\
                        + 'unit_idx: ' + unit + '\n' \
                        +'---------------------------------'
                    f.write(text)
                    f.close()
                else:
                    print(
                        ' transcript was successfuly downloaded from YouTube'
                    )
                    speech_period = extract_speech_period(
                        transcript_raw['start'], transcript_raw['end'])
                    video_meta.update({
                        transcript_name: transcript_raw['text'],
                        'speech_period': speech_period
                    })
        video_meta_list.append(video_meta)
    return video_meta_list