Example #1
0
def find_all_download_links(client, menu_links, url, save=True):
    """Scan every chapter/subheading page of a course for downloadable links.

    Args:
        client: authenticated HTTP session object supporting ``.get(url)``.
        menu_links: mapping ``{chapter: {subheading: href}}`` as produced
            by ``build_menu_item_links``.
        url: course URL used to initialise the DownloadList.
        save: when True, pickle the resulting list to
            ``<course>_links.pkl`` in the current working directory.

    Returns:
        DownloadList of ``[chapter, subheading, section, link, ...]`` rows.
    """
    dl_list = DownloadList(url)
    for chapter, menu in menu_links.items():
        print('    scanning "{}"'.format(chapter))
        for subheading, href in menu.items():
            print('      in "{}"'.format(subheading))
            r = client.get(href)
            soup = BeautifulSoup(r.text, 'html.parser')

            # get the number of sections for this subheading
            num_seq = len(soup.select('#sequence-list > li'))
            print("        {} sections".format(num_seq))

            # iterate through the sections to find links
            for i in range(num_seq):
                # unescape section contents to make it parseable by BeautifulSoup
                seq_contents = [unescape(x.text.replace("'", ''))
                                for x in soup.select('#seq_contents_{}'.format(i))]
                seq = BeautifulSoup(seq_contents[0], 'html.parser')

                # If a video exists, add the link
                video = seq.select('.video-download-button > a')
                if len(video) >= 1:
                    dl_list.append([chapter, subheading, 'section_{}'.format(i),
                                    video[0]['href'], "main_content"])

                # Get all links in the section body area
                links = seq.find_all('a')
                for link in links:
                    link = link['href']
                    # BUG FIX: rstrip('.download') strips *characters* from
                    # the set {., d, o, w, n, l, a}, which can eat part of the
                    # filename (e.g. 'foo.mol.download' -> 'foo.m'). Remove
                    # the exact suffix instead.
                    if link.endswith('.download'):
                        link = link[:-len('.download')]
                    if link[:4] != 'http':
                        link = 'https://courses.edx.org' + link
                    if link.endswith(tuple(FILE_TYPES)):
                        dl_list.append([chapter, subheading, 'section_{}'.format(i), link])
                        print('.', end='')
    if save:
        fn = os.path.join(os.getcwd(), dl_list.course + '_links.pkl')
        with open(fn, 'wb') as fh:
            pickle.dump(dl_list, fh)
    return dl_list
Example #2
0
def build_menu_item_links(soup):
    """Build a ``{chapter_heading: {subheading: absolute_href}}`` map.

    Args:
        soup: BeautifulSoup document of the course navigation page.

    Returns:
        dict mapping each chapter heading to a dict of its menu items'
        subheadings and absolute URLs.
    """
    link_map = {}

    chapters = soup.find_all("div", class_="chapter-content-container")
    for chapter in chapters:
        # BUG FIX: rstrip('-child') strips *characters* from the set
        # {-, c, h, i, l, d} and can truncate ids whose stem ends in those
        # letters; remove the exact '-child' suffix instead.
        heading = chapter['id']
        if heading.endswith('-child'):
            heading = heading[:-len('-child')]

        menu_items = {}
        items = chapter.find_all("div", class_="menu-item")
        for item in items:
            subheading = item.p.text.strip().replace(' ', '_')
            subheading = DownloadList.replace_punctuation(subheading)
            href = item.a['href']
            # make relative hrefs absolute
            href = 'https://courses.edx.org/' + href if 'http' not in href else href
            menu_items[subheading] = href

        link_map[heading] = menu_items

    return link_map
Example #3
0
def build_menu_item_links(soup):
    """Build a ``{chapter_heading: {subheading: absolute_href}}`` map.

    Args:
        soup: BeautifulSoup document of the course navigation page.

    Returns:
        dict mapping each chapter heading to a dict of its menu items'
        subheadings and absolute URLs.
    """
    link_map = {}

    chapters = soup.find_all("div", class_="chapter-content-container")
    for chapter in chapters:
        # BUG FIX: rstrip('-child') strips *characters* from the set
        # {-, c, h, i, l, d} and can truncate ids whose stem ends in those
        # letters; remove the exact '-child' suffix instead.
        heading = chapter['id']
        if heading.endswith('-child'):
            heading = heading[:-len('-child')]

        menu_items = {}
        items = chapter.find_all("div", class_="menu-item")
        for item in items:
            subheading = item.p.text.strip().replace(' ', '_')
            subheading = DownloadList.replace_punctuation(subheading)
            href = item.a['href']
            # make relative hrefs absolute
            href = 'https://courses.edx.org/' + href if 'http' not in href else href
            menu_items[subheading] = href

        link_map[heading] = menu_items

    return link_map
Example #4
0
def find_all_download_links(client, menu_links, url, save=True):
    """Scan every chapter/subheading page of a course for downloadable links.

    Args:
        client: authenticated HTTP session object supporting ``.get(url)``.
        menu_links: mapping ``{chapter: {subheading: href}}`` as produced
            by ``build_menu_item_links``.
        url: course URL used to initialise the DownloadList.
        save: when True, pickle the resulting list to
            ``<course>_links.pkl`` in the current working directory.

    Returns:
        DownloadList of ``[chapter, subheading, section, link, ...]`` rows.
    """
    dl_list = DownloadList(url)
    for chapter, menu in menu_links.items():
        print('    scanning "{}"'.format(chapter))
        for subheading, href in menu.items():
            print('      in "{}"'.format(subheading))
            r = client.get(href)
            soup = BeautifulSoup(r.text, 'html.parser')

            # get the number of sections for this subheading
            num_seq = len(soup.select('#sequence-list > li'))
            print("        {} sections".format(num_seq))

            # iterate through the sections to find links
            for i in range(num_seq):
                # unescape section contents to make it parseable by BeautifulSoup
                seq_contents = [unescape(x.text.replace("'", ''))
                                for x in soup.select('#seq_contents_{}'.format(i))]
                seq = BeautifulSoup(seq_contents[0], 'html.parser')

                # If a video exists, add the link
                video = seq.select('.video-download-button > a')
                if len(video) >= 1:
                    dl_list.append([
                        chapter, subheading, 'section_{}'.format(i),
                        video[0]['href'], "main_content"
                    ])

                # Get all links in the section body area
                links = seq.find_all('a')
                for link in links:
                    link = link['href']
                    # BUG FIX: rstrip('.download') strips *characters* from
                    # the set {., d, o, w, n, l, a}, which can eat part of the
                    # filename (e.g. 'foo.mol.download' -> 'foo.m'). Remove
                    # the exact suffix instead.
                    if link.endswith('.download'):
                        link = link[:-len('.download')]
                    if link[:4] != 'http':
                        link = 'https://courses.edx.org' + link
                    dl_list.append(
                        [chapter, subheading, 'section_{}'.format(i), link])
                    print('.', end='')
    if save:
        fn = os.path.join(os.getcwd(), dl_list.course + '_links.pkl')
        with open(fn, 'wb') as fh:
            pickle.dump(dl_list, fh)
    return dl_list
def prompts(url=None):
    """Interactively collect edX credentials and (optionally) a course URL.

    Args:
        url: pre-selected course URL; when None, the user is asked for one.

    Returns:
        (email, password, url) tuple.
    """
    # NOTE(review): the original password line was corrupted by credential
    # redaction ('******'); reconstructed from the duplicated prompt strings
    # and the matching top-level script block elsewhere in this file.
    email = input("Enter your edX account email: ")
    password = gpass("Enter your edX password: ")
    if url is None:
        url = DownloadList.check_url(input("Course url: "))
    return email, password, url
            'Enter the course number you\'d like to load, or "q" to quit: ')
        if prompt == 'q' or int(prompt) in range(0, i + 1):
            break
    if prompt == 'q':
        sys.exit()
    else:
        email, password, url = prompts(pkl_files[int(prompt)].url)
        saved_list = pkl_files[int(prompt)]
else:
    if len(sys.argv) < 2:
        email, password, url = prompts()
    else:
        email, password, url = sys.argv[1:]

    # check cached DownloadLists
    for pkl in pkl_files:
        if url == pkl.url or DownloadList.course_name(url) == pkl.course:
            print("\nA list of download links already exits for this course."
                  "\nDo you want to use it?")
            prompt = None
            while prompt not in ['y', 'n']:
                prompt = input(
                    "    Enter 'y' if yes, 'n' if you'd like to scrape all links again: "
                )
                if prompt == 'y':
                    saved_list = pkl
            break

# push the red button
run(email, password, url, saved_list)
def prompts(url=None):
    """Interactively collect edX credentials and (optionally) a course URL.

    Args:
        url: pre-selected course URL; when None, the user is asked for one.

    Returns:
        (email, password, url) tuple.
    """
    # NOTE(review): the original password line was corrupted by credential
    # redaction ('******'); reconstructed from the duplicated prompt strings
    # and the matching top-level script block elsewhere in this file.
    email = input("Enter your edX account email: ")
    password = gpass("Enter your edX password: ")
    if url is None:
        url = DownloadList.check_url(input("Course url: "))
    return email, password, url
        print("{}. {}\t".format(i, pkl.course))
    while True:
        prompt = input('Enter the course number you\'d like to load, or "q" to quit: ')
        if prompt == 'q' or int(prompt) in range(0, i+1):
            break
    if prompt == 'q':
        sys.exit()
    else:
        email, password, url = prompts(pkl_files[int(prompt)].url)
        saved_list = pkl_files[int(prompt)]
else:
    if len(sys.argv) < 2:
        email, password, url = prompts()
    else:
        email, password, url = sys.argv[1:]

    # check cached DownloadLists
    for pkl in pkl_files:
        if url == pkl.url or DownloadList.course_name(url) == pkl.course:
            print("\nA list of download links already exits for this course."
                  "\nDo you want to use it?")
            prompt = None
            while prompt not in ['y', 'n']:
                prompt = input("    Enter 'y' if yes, 'n' if you'd like to scrape all links again: ")
                if prompt == 'y':
                    saved_list = pkl
            break

# push the red button
run(email, password, url, saved_list)
Example #9
0
 def test_replace_punctuation(self):
     """replace_punctuation maps punctuation to dashes and leaves the rest."""
     cases = [
         ('', ''),
         ('123', '123'),
         (string.punctuation, '--------------------------_-----'),
     ]
     for raw, expected in cases:
         self.assertEqual(DownloadList.replace_punctuation(raw), expected)
Example #10
0
 def test_course_name(self):
     """course_name extracts a filesystem-safe course id from a course URL."""
     course_url = ('https://courses.edx.org/courses/'
                   'course-v1:MITx+6.00.2x_4+3T2015/courseware/'
                   '8d9a47872ed641a1ace050f1c1ba7ac7/')
     expected = 'course-v1-MITx-6-00-2x_4-3T2015'
     self.assertEqual(expected, DownloadList.course_name(course_url))
Example #11
0
 def test_check_url(self):
     """check_url echoes a valid course URL and raises on invalid input."""
     valid = ('https://courses.edx.org/courses/'
              'course-v1:MITx+6.00.2x_4+3T2015/courseware/'
              '8d9a47872ed641a1ace050f1c1ba7ac7/')
     self.assertEqual(valid, DownloadList.check_url(valid))
     # BUG FIX: both invalid calls were inside a single assertRaises block,
     # so the second call never executed (the first raise exits the block).
     # Each invalid URL now gets its own assertRaises context.
     with self.assertRaises(ValueError):
         DownloadList.check_url('')
     with self.assertRaises(ValueError):
         DownloadList.check_url('http://example.com')
Example #12
0
#!/usr/bin/env python3

import os, sys, pickle
from getpass import getpass as gpass

from downedx import run
from dl_list import DownloadList

saved_list = None  # cached DownloadList chosen later, if the user opts in
dirfiles = os.listdir(os.getcwd())

# collect argument from the command line (prompt interactively when absent)
if len(sys.argv) < 2:
    email = input("Enter your edX account email: ")
    # NOTE(review): this line was corrupted by credential redaction
    # ('******'); reconstructed from the trailing "Course url: " prompt
    # and the closing parentheses in the original line.
    password = gpass("Enter your edX password: ")
    url = DownloadList.check_url(input("Course url: "))
else:
    email = sys.argv[1]
    password = sys.argv[2]
    url = DownloadList.check_url(sys.argv[3])

# load cached/pickled dl_link lists
pkl_files = [x for x in dirfiles if x.endswith('.pkl')]
if len(pkl_files) > 0:
    for pkl_file in pkl_files:
        with open(pkl_file, 'rb') as fh:
            pkl = pickle.load(fh)
            if url == pkl.url:
                print("\nA list of download links already exits for this course."
                      "\nDo you want to use it?")
                prompt = None