def find_all_download_links(client, menu_links, url, save=True):
    """Scan every courseware page and collect links to downloadable files.

    Each page listed in *menu_links* is fetched with ``client.get`` and its
    ``#seq_contents_<i>`` containers (one per ``#sequence-list > li`` entry)
    are unescaped and parsed.  For every section the video download button,
    if present, is recorded with a ``"main_content"`` marker, followed by
    every anchor whose target ends with one of ``FILE_TYPES``.

    Args:
        client: object whose ``get(href)`` returns a response exposing the
            page markup as ``.text``.
        menu_links: mapping of chapter name to a mapping of subheading name
            to page URL.
        url: course URL handed to ``DownloadList``.
        save: when true, pickle the resulting list to
            ``<course>_links.pkl`` in the current working directory.

    Returns:
        The populated ``DownloadList``.
    """
    download_suffix = '.download'
    # Build the suffix tuple once instead of once per link.
    file_types = tuple(FILE_TYPES)
    dl_list = DownloadList(url)

    for chapter, menu in menu_links.items():
        print(' scanning "{}"'.format(chapter))
        for subheading, href in menu.items():
            print(' in "{}"'.format(subheading))
            r = client.get(href)
            soup = BeautifulSoup(r.text, 'html.parser')

            # get the number of sections for this subheading
            num_seq = len(soup.select('#sequence-list > li'))
            print(" {} sections".format(num_seq))

            # iterate through the sections to find links
            for i in range(num_seq):
                section = 'section_{}'.format(i)
                containers = soup.select('#seq_contents_{}'.format(i))
                if not containers:
                    # The sequence list advertises this section but the page
                    # carries no markup for it; skip it instead of crashing
                    # the whole scan with an IndexError.
                    print('.', end='')
                    continue

                # unescape section contents to make it parseable by
                # BeautifulSoup
                seq = BeautifulSoup(
                    unescape(containers[0].text.replace("'", '')),
                    'html.parser')

                # If a video exists, add the link
                video = seq.select('.video-download-button > a')
                if len(video) >= 1:
                    dl_list.append([chapter, subheading, section,
                                    video[0]['href'], "main_content"])

                # Get all links in the section body area
                for anchor in seq.find_all('a'):
                    link = anchor.get('href')
                    if not link:
                        # <a> tags without a target have nothing to download.
                        continue
                    if link.endswith(download_suffix):
                        # Slice the suffix off; str.rstrip() would strip any
                        # trailing run of the characters ".download" and eat
                        # into the real file name (e.g. ".html" -> ".htm").
                        link = link[:-len(download_suffix)]
                    if not link.startswith('http'):
                        link = 'https://courses.edx.org' + link
                    if link.endswith(file_types):
                        dl_list.append([chapter, subheading, section, link])
                print('.', end='')

    if save:
        fn = os.path.join(os.getcwd(), dl_list.course + '_links.pkl')
        with open(fn, 'wb') as fh:
            pickle.dump(dl_list, fh)
    return dl_list
def build_menu_item_links(soup):
    """Map each chapter of the course menu to its subheading links.

    Args:
        soup: parsed courseware page; it is searched for
            ``div.chapter-content-container`` elements, each containing
            ``div.menu-item`` entries with a ``<p>`` title and an ``<a>``
            link.

    Returns:
        A dict keyed by chapter heading (the container's ``id`` without its
        ``-child`` suffix) whose values are dicts mapping the sanitised
        subheading title to an absolute URL.
    """
    base_url = 'https://courses.edx.org/'
    child_suffix = '-child'
    link_map = {}

    for chapter in soup.find_all("div", class_="chapter-content-container"):
        heading = chapter['id']
        if heading.endswith(child_suffix):
            # Remove the literal suffix only; str.rstrip() treats its
            # argument as a character set and would also eat trailing
            # letters of the heading itself (e.g. "Final-child" -> "Fina").
            heading = heading[:-len(child_suffix)]

        menu_items = {}
        for item in chapter.find_all("div", class_="menu-item"):
            subheading = item.p.text.strip().replace(' ', '_')
            subheading = DownloadList.replace_punctuation(subheading)
            href = item.a['href']
            # Test the prefix rather than a substring so a relative path that
            # merely contains "http" is still made absolute, and drop leading
            # slashes so the joined URL has no "//" after the host.
            if not href.startswith('http'):
                href = base_url + href.lstrip('/')
            menu_items[subheading] = href
        link_map[heading] = menu_items

    return link_map
def find_all_download_links(client, menu_links, url, save=True):
    """Scan every courseware page and collect the links found in it.

    Each page listed in *menu_links* is fetched with ``client.get`` and its
    ``#seq_contents_<i>`` containers (one per ``#sequence-list > li`` entry)
    are unescaped and parsed.  For every section the video download button,
    if present, is recorded with a ``"main_content"`` marker, followed by
    every anchor target in the section (no file-type filtering is applied
    here).

    Args:
        client: object whose ``get(href)`` returns a response exposing the
            page markup as ``.text``.
        menu_links: mapping of chapter name to a mapping of subheading name
            to page URL.
        url: course URL handed to ``DownloadList``.
        save: when true, pickle the resulting list to
            ``<course>_links.pkl`` in the current working directory.

    Returns:
        The populated ``DownloadList``.
    """
    download_suffix = '.download'
    dl_list = DownloadList(url)

    for chapter, menu in menu_links.items():
        print(' scanning "{}"'.format(chapter))
        for subheading, href in menu.items():
            print(' in "{}"'.format(subheading))
            r = client.get(href)
            soup = BeautifulSoup(r.text, 'html.parser')

            # get the number of sections for this subheading
            num_seq = len(soup.select('#sequence-list > li'))
            print(" {} sections".format(num_seq))

            # iterate through the sections to find links
            for i in range(num_seq):
                section = 'section_{}'.format(i)
                containers = soup.select('#seq_contents_{}'.format(i))
                if not containers:
                    # The sequence list advertises this section but the page
                    # carries no markup for it; skip it instead of crashing
                    # the whole scan with an IndexError.
                    print('.', end='')
                    continue

                # unescape section contents to make it parseable by
                # BeautifulSoup
                seq = BeautifulSoup(
                    unescape(containers[0].text.replace("'", '')),
                    'html.parser')

                # If a video exists, add the link
                video = seq.select('.video-download-button > a')
                if len(video) >= 1:
                    dl_list.append([
                        chapter, subheading, section,
                        video[0]['href'], "main_content"
                    ])

                # Get all links in the section body area
                for anchor in seq.find_all('a'):
                    link = anchor.get('href')
                    if not link:
                        # <a> tags without a target have nothing to record.
                        continue
                    if link.endswith(download_suffix):
                        # Slice the suffix off; str.rstrip() would strip any
                        # trailing run of the characters ".download" and eat
                        # into the real file name (e.g. ".html" -> ".htm").
                        link = link[:-len(download_suffix)]
                    if not link.startswith('http'):
                        link = 'https://courses.edx.org' + link
                    dl_list.append([chapter, subheading, section, link])
                print('.', end='')

    if save:
        fn = os.path.join(os.getcwd(), dl_list.course + '_links.pkl')
        with open(fn, 'wb') as fh:
            pickle.dump(dl_list, fh)
    return dl_list
def prompts(url=None):
    """Interactively collect the edX credentials and, if needed, a course URL.

    NOTE(review): the original line was syntactically invalid -- the text
    between the password prompt and the course-URL prompt had been replaced
    by ``******`` (apparently by a credential scrubber).  The ``url is None``
    guard below is reconstructed from the ``url=None`` default and from the
    surviving ``"Course url: "`` prompt; confirm against version control.

    Args:
        url: course URL to reuse.  When ``None`` the user is asked for one
            and the answer is passed through ``DownloadList.check_url``.

    Returns:
        A ``(email, password, url)`` tuple.
    """
    email = input("Enter your edX account email: ")
    password = gpass("Enter your edX password: ")
    if url is None:
        url = DownloadList.check_url(input("Course url: "))
    return email, password, url
'Enter the course number you\'d like to load, or "q" to quit: ') if prompt == 'q' or int(prompt) in range(0, i + 1): break if prompt == 'q': sys.exit() else: email, password, url = prompts(pkl_files[int(prompt)].url) saved_list = pkl_files[int(prompt)] else: if len(sys.argv) < 2: email, password, url = prompts() else: email, password, url = sys.argv[1:] # check cached DownloadLists for pkl in pkl_files: if url == pkl.url or DownloadList.course_name(url) == pkl.course: print("\nA list of download links already exits for this course." "\nDo you want to use it?") prompt = None while prompt not in ['y', 'n']: prompt = input( " Enter 'y' if yes, 'n' if you'd like to scrape all links again: " ) if prompt == 'y': saved_list = pkl break # push the red button run(email, password, url, saved_list)
print("{}. {}\t".format(i, pkl.course)) while True: prompt = input('Enter the course number you\'d like to load, or "q" to quit: ') if prompt == 'q' or int(prompt) in range(0, i+1): break if prompt == 'q': sys.exit() else: email, password, url = prompts(pkl_files[int(prompt)].url) saved_list = pkl_files[int(prompt)] else: if len(sys.argv) < 2: email, password, url = prompts() else: email, password, url = sys.argv[1:] # check cached DownloadLists for pkl in pkl_files: if url == pkl.url or DownloadList.course_name(url) == pkl.course: print("\nA list of download links already exits for this course." "\nDo you want to use it?") prompt = None while prompt not in ['y', 'n']: prompt = input(" Enter 'y' if yes, 'n' if you'd like to scrape all links again: ") if prompt == 'y': saved_list = pkl break # push the red button run(email, password, url, saved_list)
def test_replace_punctuation(self):
    """replace_punctuation leaves '' and digits alone and rewrites punctuation."""
    sanitize = DownloadList.replace_punctuation
    # (input, expected output) pairs, checked in order.
    expectations = (
        ('', ''),
        ('123', '123'),
        (string.punctuation, '--------------------------_-----'),
    )
    for raw, cleaned in expectations:
        self.assertEqual(sanitize(raw), cleaned)
def test_course_name(self):
    """course_name derives the expected course identifier from a courseware URL."""
    courseware_url = 'https://courses.edx.org/courses/course-v1:MITx+6.00.2x_4+3T2015/courseware/8d9a47872ed641a1ace050f1c1ba7ac7/'
    derived = DownloadList.course_name(courseware_url)
    self.assertEqual('course-v1-MITx-6-00-2x_4-3T2015', derived)
def test_check_url(self):
    """check_url returns a valid courseware URL unchanged and rejects bad ones."""
    valid_url = 'https://courses.edx.org/courses/course-v1:MITx+6.00.2x_4+3T2015/courseware/8d9a47872ed641a1ace050f1c1ba7ac7/'
    self.assertEqual(valid_url, DownloadList.check_url(valid_url))

    # Use one assertRaises context per input: the context exits at the first
    # exception, so a second call placed in the same block would never run
    # and its behaviour would go unverified.
    for bad_url in ('', 'http://example.com'):
        with self.assertRaises(ValueError):
            DownloadList.check_url(bad_url)
#!/usr/bin/env python3 import os, sys, pickle from getpass import getpass as gpass from downedx import run from dl_list import DownloadList saved_list = None dirfiles = os.listdir(os.getcwd()) # collect argument from the command line if len(sys.argv) < 2: email = input("Enter your edX account email: ") password = gpass("Enter your edX password: "******"Course url: ")) else: email = sys.argv[1] password = sys.argv[2] url = DownloadList.check_url(sys.argv[3]) # load cached/pickled dl_link lists pkl_files = [x for x in dirfiles if x.endswith('.pkl')] if len(pkl_files) > 0: for pkl_file in pkl_files: with open(pkl_file, 'rb') as fh: pkl = pickle.load(fh) if url == pkl.url: print("\nA list of download links already exits for this course." "\nDo you want to use it?") prompt = None