Example no. 1
 def get_nodes_by_ids_complete(self, studio_id):
     NODES_ENDPOINT = self.studio_url + '/api/get_nodes_by_ids_complete/'
     headers = {"Authorization": "Token {0}".format(self.token)}
     url = NODES_ENDPOINT + studio_id
     LOGGER.info('  GET ' + url)
     response = requests.get(url, headers=headers)
     studio_node = response.json()[0]
     return studio_node
Example no. 2
def add_subpages_from_wikipedia_list(topic, list_url):
    """ add_subpages_from_wikipedia_list: Parses wiki pages and creates corresponding files
        To understand how the following parsing works, look at:
            1. the source of the page (e.g. https://en.wikipedia.org/wiki/List_of_citrus_fruits), or inspect in chrome dev tools
            2. the documentation for BeautifulSoup version 4: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
    """
    page = read_source(list_url)        # Parse the page into BeautifulSoup format, so we can loop through and manipulate it
    table = page.find("table")          # Extract the main table from the page

    # Loop through all the rows in the table
    for row in table.find_all("tr"):
        columns = row.find_all("td")    # Extract the columns (cells, really) within the current row
        if not columns:                 # Some rows are empty, so just skip
            continue

        link = columns[0].find("a")     # Get the link to the subpage
        if not link:                    # Some rows don't have links, so skip
            continue

        # Extract the URL and title for the subpage
        url = make_fully_qualified_url(link["href"])
        title = link.text
        LOGGER.info("      Writing {}...".format(title))

        # Attempt to extract a thumbnail for the subpage, from the second column in the table
        image = columns[1].find("img")
        thumbnail_url = make_fully_qualified_url(image["src"]) if image else None
        if thumbnail_url and not (thumbnail_url.endswith("jpg") or thumbnail_url.endswith("png")):
            thumbnail_url = None

        # Download the wikipedia page into an HTML5 app node
        html5app = download_wikipedia_page(url, thumbnail=thumbnail_url, title=title)

        # Add the downloaded HTML5 app node into the topic
        topic.add_child(html5app)
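The `make_fully_qualified_url` helper used above is not shown in this listing; a minimal sketch, assuming English-Wikipedia relative links, could look like this:

def make_fully_qualified_url(href):
    # Hypothetical helper: resolve protocol-relative and site-relative hrefs.
    if href.startswith('//'):
        return 'https:' + href
    if href.startswith('/'):
        return 'https://en.wikipedia.org' + href
    return href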
    def construct_channel(self, *args, **kwargs):
        """ construct_channel: Creates ChannelNode and build topic tree

            Wikipedia is organized with the following hierarchy:
                Citrus (Folder)
                |   Citrus Page HTML Zip (File)
                Potatoes (Folder)
                |   Potatoes Page HTML Zip (File)

            Returns: ChannelNode
        """
        LOGGER.info("Constructing channel from {}...".format(BASE_URL))

        channel = self.get_channel(
            *args,
            **kwargs)  # Creates ChannelNode from data in self.channel_info

        create_topic(channel, "Citrus!",
                     "List_of_citrus_fruits")  # Add Citrus folder
        create_topic(channel, "Potatoes!",
                     "List_of_potato_cultivars")  # Add Potatoes folder

        raise_for_invalid_channel(
            channel)  # Check for errors in channel construction

        return channel
Example no. 4
    def add_content_nodes(self, channel):
        """
        Build the hierarchy of topic nodes and content nodes.
        """
        LOGGER.info('Creating channel content nodes...')

        course_list = json.load(
            open(os.path.join(COURSES_DIR, 'course_list.json')))
        for course in course_list['courses']:  # [1:2]:
            basedir = os.path.join(COURSES_DIR, course['name'])
            coursedir = os.path.join(basedir, 'course')
            course_data = extract_course_tree(coursedir)
            course_id = course_data['course']
            write_tree_to_json_tree(
                os.path.join(ORIGINAL_TREES_DIR, course_id + '.json'),
                course_data)
            # print_course(course_data, translate_from='ar')
            clean_subtree(course_data, coursedir)
            print('Cleaned course', course_data['course'], '#' * 80)
            write_tree_to_json_tree(
                os.path.join(CLEAN_TREES_DIR, course_id + '.json'),
                course_data)
            transformed_tree = transform_tree(course_data, coursedir)
            write_tree_to_json_tree(
                os.path.join(TRANSFORMED_TREES_DIR, course_id + '.json'),
                transformed_tree)
            print_transfomed_tree(transformed_tree, translate_from='ar')
            channel['children'].append(transformed_tree)
            print('\n\n')
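`write_tree_to_json_tree` comes from ricecooker's json-tree utilities; a rough sketch of what such a helper does (not the library's actual implementation) is simply serializing the tree dict to disk:

import json
import os

def write_tree_to_json_tree(json_tree_path, json_tree):
    # Illustrative sketch only: ensure the target directory exists, then dump the tree.
    os.makedirs(os.path.dirname(json_tree_path) or '.', exist_ok=True)
    with open(json_tree_path, 'w') as jsonf:
        json.dump(json_tree, jsonf, indent=2, ensure_ascii=False)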
Example no. 5
def get_subtitle_languages(youtube_id):
    """
    Returns a list of the subtitle language codes available for a given video.
    We'll try to get the list using two approaches:
    1. The YouTube API (works for public videos when YOUTUBE_API_KEY is defined)
    2. The slower fallback: YouTubeResource, which in turn calls youtube_dl
    """
    # Check if we already have the lang_codes list for this youtube_id cached...
    cache_filename = '{}__lang_codes.json'.format(youtube_id)
    cache_filepath = os.path.join(SUBTITLE_LANGUAGES_CACHE_DIR, cache_filename)
    if os.path.exists(cache_filepath):  # Cache hit!
        with open(cache_filepath) as jsonf:
            cache_data = json.load(jsonf)
            return cache_data['lang_codes']

    if YOUTUBE_API_KEY:
        try:
            lang_codes = get_subtitles_using_api(youtube_id)
            return lang_codes
        except HttpError as e:
            LOGGER.info("Can't access API for video {} ...".format(youtube_id))
    lang_codes = get_subtitles_using_youtube_dl(youtube_id)

    # Cache the results in chefdata/sublangscache/{youtube_id}__lang_codes.json
    cache_data = {"lang_codes": lang_codes}
    with open(cache_filepath, 'w') as jsonf:
        json.dump(cache_data, jsonf, ensure_ascii=True)

    return lang_codes
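The `get_subtitles_using_api` helper is not shown here; a sketch using google-api-python-client (an assumption about how the YouTube Data API is reached, not the chef's actual code) might be:

def get_subtitles_using_api(youtube_id):
    from googleapiclient.discovery import build  # assumes google-api-python-client is installed
    youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
    # captions.list returns one item per available caption track; HttpError propagates to the caller
    response = youtube.captions().list(part='snippet', videoId=youtube_id).execute()
    return [caption['snippet']['language'] for caption in response.get('items', [])]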
Example no. 6
    def pre_run(self, args, options):
        """
        Build the ricecooker json tree for the channel.
        The code here is similar to the code in `ricecooker_channel/chef.py`, but
        the channel hierarchy is built using dictionary objects instead of classes.
        """
        LOGGER.info('In pre_run...')

        # 1. Create the channel tree
        ricecooker_json_tree = dict(
            title='Sample JSON channel',
            source_domain='source.org',
            source_id='sample-json-channel',
            description='This channel was created from the files in the content/ ' \
                + 'directory and the metadata in sample_ricecooker_json_tree.json',
            thumbnail='./content/sample-json-channel-files/channel_thumbnail.jpg',
            language='en',
            children=[],
        )
        # The root object of the ricecooker json tree contains the channel info;
        # add topic and content nodes to the children list to build the tree.

        # 2. Add topic nodes and content nodes to the tree
        self.create_content_nodes(ricecooker_json_tree)
        self.create_exercise_nodes(ricecooker_json_tree)

        # 3. Save the tree to chefdata/trees/sample_ricecooker_json_tree.json
        json_tree_path = self.get_json_tree_path()
        write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
        LOGGER.info('Finished writing ricecooker json tree.')
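For reference, a hypothetical `create_content_nodes` appends node dicts to the channel's `children` list; the exact node schema is defined by `ricecooker.utils.jsontrees`, so treat the fields below as an illustrative sketch rather than the authoritative format.

    def create_content_nodes(self, channel):
        # Sketch only: append a topic dict (with its own children list) under the channel.
        topic_node = dict(
            title='Sample Topic',
            source_id='sample-topic-1',
            description='An illustrative topic node.',
            language='en',
            children=[],
        )
        channel['children'].append(topic_node)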
Example no. 7
    def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
        youtube_info = None
        # 1. Try to get from cache if allowed:
        if os.path.exists(self.cache_path) and use_cache:
            LOGGER.info("==> [%s] Retrieving cached information...", self.__str__())
            youtube_info = json.load(open(self.cache_path))
        # 2. Fetch info from youtube_dl
        if not youtube_info:
            LOGGER.info("==> [%s] Requesting info from youtube...", self.__str__())
            os.makedirs(self.cache_dir, exist_ok=True)
            youtube_resource = None  # guard against ExtractorError leaving this unbound
            try:
                youtube_resource = YouTubeResource(self.url, useproxy=use_proxy)
            except youtube_dl.utils.ExtractorError as e:
                if "unavailable" in str(e):
                    LOGGER.error("==> [%s] Resource unavailable for URL: %s", self.__str__(), self.url)
                    return None

            if youtube_resource:
                try:
                    # Save YouTube info to JSON cache file
                    youtube_info = youtube_resource.get_resource_info(options)
                    if youtube_info:
                        json.dump(youtube_info,
                                  open(self.cache_path, 'w'),
                                  indent=4,
                                  ensure_ascii=False,
                                  sort_keys=True)
                    else:
                        LOGGER.error("==> [%s] Failed to extract YouTube info", self.__str__())
                except Exception as e:
                    LOGGER.error("==> [%s] Failed to get YouTube info: %s", self.__str__(), e)
                    return None
        return youtube_info
def crawling_part():
    """
    Visit all the urls on engageny.org/resource/ and engageny.org/content, and extract content structure.
    """
    # crawl website to build web_resource_tree
    ela_hierarchy, math_hierarchy = crawl(ENGAGENY_CC_START_URL)
    web_resource_tree = dict(
        kind="EngageNYWebResourceTree",
        title="Engage NY Web Resource Tree (ELS and CCSSM)",
        language='en',
        children={
            'math': {
                'grades': math_hierarchy,
            },
            'ela': {
                'grades': ela_hierarchy,
            },
        },
    )
    json_file_name = os.path.join(TREES_DATA_DIR, CRAWLING_STAGE_OUTPUT)
    with open(json_file_name, 'w') as json_file:
        json.dump(web_resource_tree, json_file, indent=2)
        LOGGER.info('Crawling results stored in ' + json_file_name)

    return web_resource_tree
Example no. 9
 def recursive_extract_website_games(subtree):
     """
     Processes all child nodes of the subtree then calls itself on any folder-like
     child nodes. Weird, I know, but it works so I'm not touching it.
     """
     if 'children' in subtree:
         # do processing
         new_children = []
         for child in subtree['children']:
             child_url = child['url']
             if child['kind'] == 'PrathamZipResource':
                 if is_website_game(child_url):
                     # extract all game names referenced in manual curation Excel file to process separately...
                     child_url = child_url.replace(
                         'https://www.prathamopenschool.org/CourseContent/Games/',
                         '')
                     child_url = child_url.replace(
                         'http://www.prathamopenschool.org/CourseContent/Games/',
                         '')
                     child['title_en'] = child_url.replace('.zip', '')
                      print('EXTRACTED game name', child['title_en'], 'from url', child['url'])
                     website_games.append(child)
                 else:
                     # leave other games where they are
                     LOGGER.info('Undocumented game-like web resource ' +
                                 child['url'])
                     new_children.append(child)
             else:
                 # leave other content as is
                 new_children.append(child)
         #
         # recurse
         for child in subtree['children']:
             recursive_extract_website_games(child)
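The `is_website_game` check used above is not shown in this listing; a minimal sketch, assuming a hypothetical `WEBSITE_GAMES_NAMES` list loaded from the manual curation Excel file, could be:

 def is_website_game(url):
     # Hypothetical helper: WEBSITE_GAMES_NAMES would hold the game names from the curation sheet.
     game_name = url.split('/')[-1].replace('.zip', '')
     return game_name in WEBSITE_GAMES_NAMES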
Example no. 10
 def get_nodes_by_ids_bulk(self, studio_ids):
     """
     A more efficient version of `get_nodes_by_ids_complete` that GETs tree
     content node data from the Studio API in chunks of CHUNK_SIZE ids per request.
     """
     CHUNK_SIZE = 25
     NODES_ENDPOINT = self.studio_url + '/api/get_nodes_by_ids_complete/'
     headers = {"Authorization": "Token {0}".format(self.token)}
     studio_nodes = []
     studio_ids_chunks = [
         studio_ids[i:i + CHUNK_SIZE]
         for i in range(0, len(studio_ids), CHUNK_SIZE)
     ]
     for studio_ids_chunk in studio_ids_chunks:
         studio_ids_csv = ','.join(studio_ids_chunk)
         url = NODES_ENDPOINT + studio_ids_csv
         LOGGER.info('  GET ' + url)
         response = requests.get(url, headers=headers)
         chunk_nodes = response.json()
         for chunk_node in chunk_nodes:
             if 'children' in chunk_node:
                 child_nodes = self.get_nodes_by_ids_bulk(
                     chunk_node['children'])
                 chunk_node['children'] = child_nodes
         studio_nodes.extend(chunk_nodes)
     return studio_nodes
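A brief usage sketch (the client object and studio id are hypothetical names): fetch the root node completely, then fetch its subtree in bulk.

 root_node = api.get_nodes_by_ids_complete(root_studio_id)
 subtree_nodes = api.get_nodes_by_ids_bulk(root_node['children'])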
Example no. 11
    def construct_channel(self, *args, **kwargs):
        """ construct_channel: Creates ChannelNode and build topic tree

            Solar Spell is organized with the following hierarchy (sample):
                Creative Arts (source_id = dir-creative-arts)
                |--- Culinary Arts (source_id = dir-culinary-arts)
                |--- |--- Real Pasifik 2 introducing Chef Alexis Tahiapuhe of Tahiti (source_id = file-real pasifik 2 introducing chef lela bolobolo of fiji.mp4)
                |--- Pacific Islands Arts and Culture (source_id = dir_pacific_islands_arts_and_culture)
                |--- |--- Cook Islands National Cultural Policy 10 July 2017_final english (File)
                |--- Teaching Resources and Classroom Activities
                Environment (source_id = dir-environment)
                |--- Adapting to Climate Change
                |--- |--- Action Against Climate Change Tuvalu Water and climate change
                |--- Climate Change Info                
                |--- |--- Animated Pacific Island Climate Change Videos
                ...
            Returns: ChannelNode
        """
        LOGGER.info("Constructing channel from {}...".format(BASE_URL))
        channel = self.get_channel(
            *args,
            **kwargs)  # Creates ChannelNode from data in self.channel_info
        LOGGER.info('   Writing {} Folder...'.format(CHANNEL_NAME))
        endpoint = BASE_URL + "content/"
        scrape_content(endpoint, channel)
        raise_for_invalid_channel(
            channel)  # Check for errors in channel construction
        return channel
Example no. 12
def scrape_multilanguage_slideshows(channel):
    LOGGER.info('Scraping multi-language content...')
    contents = BeautifulSoup(downloader.read(SLIDESHOWS_URL), 'html5lib')
    collection_key = get_collection_key(contents)

    languages_selection = contents.find('div', {
        'class': 'asset-list'
    }).find('div')
    language_list = json.loads(
        languages_selection['data-react-props'])['sections']

    for language in language_list:
        asset_url = SLIDESHOW_ASSETS_URL.format(
            collection='qac6i4-foozd4-68u325', section=language['section_key'])
        slide_data = json.loads(downloader.read(asset_url))['data']
        translated_name = languages.getlang(
            LANGUAGE_MAP[language['name']]).native_name if LANGUAGE_MAP[
                language['name']] else language['name']
        LOGGER.info('    {}'.format(translated_name.encode('utf-8')))

        slides = [{
            'url':
            slide['attributes']['thumbnail_url'].replace(
                'element.png', '*****@*****.**')
        } for slide in slide_data]
        if len(slides):
            channel.add_child(
                create_slideshow(slides, asset_url, translated_name,
                                 language['name']))
Example no. 13
def scrape_english_collection(channel):
    LOGGER.info('Scraping English collection...')
    english_topic = nodes.TopicNode(source_id=ENGLISH_COLLECTION_URL,
                                    title="English")
    channel.add_child(english_topic)

    contents = BeautifulSoup(downloader.read(ENGLISH_COLLECTION_URL),
                             'html5lib')
    collection_key = get_collection_key(contents)

    topic_selection = contents.find('div', {'class': 'asset-list'}).find('div')
    topic_list = [
        t for t in json.loads(topic_selection['data-react-props'])['sections']
        if t['id'] not in EXCLUDED_TOPIC_IDS
    ]

    for topic in topic_list:
        LOGGER.info('    {}'.format(topic['name'].encode('utf-8')))
        topic_node = nodes.TopicNode(source_id=topic['section_key'],
                                     title=topic['name'])
        english_topic.add_child(topic_node)

        # Scrape items in the topic
        url = ENGLISH_ASSETS_URL.format(collection=collection_key,
                                        section=topic['section_key'])
        scrape_collection_files(topic_node, url)
def scrape_channel(channel):
    # Read from Categorias dropdown menu
    page = BeautifulSoup(downloader.read(BASE_URL), 'html5lib')
    dropdown = page.find('a', {'id': 'btn-categorias'}).find_next_sibling('ul')

    # Go through dropdown and generate topics and subtopics
    for category_list in dropdown.find_all('li', {'class': 'has-children'}):

        # Parse categories
        for category in category_list.find_all('li', {'class': 'has-children'}):
            # Add this topic to channel when scraping entire channel
            category_name = category.find('a').text
            topic = nodes.TopicNode(title=category_name, source_id=get_source_id(category_name))
            channel.add_child(topic)
            LOGGER.info(topic.title)

            # Parse subcategories
            for subcategory in category.find_all('li'):
                if not subcategory.attrs.get('class') or 'go-back' not in subcategory.attrs['class']:
                    # Get rid of this check to scrape entire site
                    subcategory_name = subcategory.find('a').text
                    subcategory_link = subcategory.find('a')['href']
                    LOGGER.info('  {}'.format(subcategory_name))
                    subtopic = nodes.TopicNode(title=subcategory_name, source_id=get_source_id(subcategory_link))
                    topic.add_child(subtopic)

                    # Parse resources
                    scrape_subcategory(subcategory_link, subtopic)
def scrape_resource(url, topic):
    resource = BeautifulSoup(downloader.read(url), 'html5lib')
    LOGGER.info('      {}'.format(resource.find('h2').text))

    filepath = download_resource(resource.find('div', {'class': 'decargas'}).find('a')['href'])
    license = None
    author = ''
    for data_section in resource.find('div', {'class': 'datos_generales'}).find_all('h4'):
        if 'Licencia' in data_section.text:
            try:
                license = LICENSE_MAP[data_section.find_next_sibling('p').text](copyright_holder="Ceibal")
            except KeyError as e:
                LOGGER.error(str(e))
                license = licenses.CC_BYLicense(copyright_holder="Ceibal")
        elif 'Autor' in data_section.text:
            author = data_section.find_next_sibling('p').text
    if filepath:
        thumbnail = resource.find('div', {'class': 'img-recurso'}).find('img')['src']
        if thumbnail.endswith('.gif'):
            thumbnail = os.path.sep.join([DOWNLOAD_DIRECTORY, thumbnail.split('/')[-1].replace('.gif', '.png')])
            with open(thumbnail, 'wb') as fobj:
                fobj.write(downloader.read(resource.find('div', {'class': 'img-recurso'}).find('img')['src']))

        topic.add_child(nodes.HTML5AppNode(
            title=resource.find('h2').text,
            source_id=url,
            license=license,
            author=author,
            description=resource.find('form').find_all('p')[1].text,
            thumbnail=thumbnail,
            tags=[tag.text[:30] for tag in resource.find_all('a', {'class': 'tags'})],
            files=[files.HTMLZipFile(path=filepath)],
        ))
def scrape_snack_subject(slug, topic):
    """ Scrape snack subject page
        Args:
            slug (str): url slug to scrape from (e.g. /subject/arts)
            topic (TopicNode): topic to add html nodes to
    """
    contents = BeautifulSoup(read(slug), 'html5lib')

    for activity in contents.find_all('div', {'class': 'activity'}):
        LOGGER.info("        {}".format(activity.find('h5').text.strip()))
        # Scrape snack pages into zips
        write_to_path, tags = scrape_snack_page(activity.find('a')['href'])
        if not write_to_path:
            continue

        # Create html node
        description = activity.find('div', {'class': 'pod-description'})
        topic.add_child(
            nodes.HTML5AppNode(
                source_id=activity.find('a')['href'],
                title=activity.find('h5').text.strip().replace("’", "'"),
                description=description.text.strip() if description else "",
                license=LICENSE,
                copyright_holder=COPYRIGHT_HOLDER,
                files=[files.HTMLZipFile(path=write_to_path)],
                thumbnail=get_thumbnail_url(activity.find('img')['src']),
                tags=tags,
            ))

    # Scrape next page (if any)
    next_page_url = get_next_page_url(contents)
    if next_page_url:
        scrape_snack_subject(next_page_url, topic)
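The `get_next_page_url` pager helper is not shown in these examples; a rough sketch, with an assumed pager markup, could be:

def get_next_page_url(contents):
    # Hypothetical selector: adjust to the site's actual pager markup.
    next_link = contents.find('li', {'class': 'pager-next'})
    if next_link and next_link.find('a'):
        return next_link.find('a')['href']  # may still need to be made absolute
    return None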
def get_html5_app_zip_path(slug):
    resp = session.get(READ_URL.format(slug))
    if resp.status_code == 200:
        resp = resp.json()
    else:
        LOGGER.info('The story {} is not available.\n'.format(slug))
        return None
    content = ""
    for page in (resp['data']['pages']):
        soup = BeautifulSoup(page['html'], 'html.parser')
        if page.get('coverImage', None):
            img_src = page['coverImage']['sizes'][-1]['url']
            soup.img['src'] = img_src
        content = content + "\n" + str(soup)

    context = {'content': content}

    handle, destination = tempfile.mkstemp(suffix=".zip")
    os.close(handle)
    htmlwriter = HTMLWriter(destination)
    with htmlwriter as f:
        index_html = TEMPLATE_ENVIRONMENT.get_template('indexfile').render(
            context)
        f.write_index_contents(index_html)

    LOGGER.info(destination)
    return destination
def scrape_video_menu(url):
    """ Scrape videos from url
        Args:
            url (str): url to scrape from (e.g. https://www.exploratorium.edu/video/subjects)
        Returns TopicNode containing all videos
    """
    LOGGER.info("SCRAPING VIDEOS...")
    video_topic = nodes.TopicNode(title="Videos",
                                  source_id="main-topic-videos")
    contents = BeautifulSoup(read(url), 'html5lib')

    for subject in contents.find_all('div', {'class': 'subject'}):
        title = subject.find('div', {
            'class': 'name'
        }).text.strip().replace("’", "'")
        LOGGER.info("    {}".format(title))
        topic = nodes.TopicNode(
            title=title,
            source_id="videos-{}".format(title),
            thumbnail=get_thumbnail_url(subject.find('img')['src']),
        )
        video_topic.add_child(topic)
        scrape_video_subject(subject.find('a')['href'], topic)

    return video_topic
Example no. 19
    def write_question_row_from_question_dict(self, source_id, question_dict):
        file_path = get_metadata_file_path(self.channeldir, self.questionsinfo)
        with open(file_path, 'a') as csv_file:
            csvwriter = csv.DictWriter(csv_file, EXERCISE_QUESTIONS_INFO_HEADER)

            def _safe_list_get(l, idx, default):
                try:
                    return l[idx]
                except IndexError:
                    return default

            # change image links to local
            question_dict = self._make_local_question_images(question_dict)

            type_lookup = {
                'single_selection': exercises.SINGLE_SELECTION,
                'true_false': exercises.SINGLE_SELECTION,
                'multiple_selection': exercises.MULTIPLE_SELECTION,
                'input_question': exercises.INPUT_QUESTION,
            }

            # ANSWERS
            answers = json.loads(question_dict['answers'])
            options = []  # all options
            correct = []  # correct answers
            for ans in answers:
                options.append(ans['answer'])
                if ans['correct']:
                    correct.append(ans['answer'])
            extra_options = DEFAULT_EXTRA_ITEMS_SEPARATOR.join(options[5:])

            # HINTS
            hints_raw = json.loads(question_dict['hints'])
            if hints_raw:
                raise ValueError('Found hints, but hint handling is not implemented.')

            LOGGER.info('     - writing question with studio_id=' + question_dict['assessment_id'])
            question_row = {}
            question_row[EXERCISE_SOURCEID_KEY] = source_id
            question_row[EXERCISE_QUESTIONS_QUESTIONID_KEY] = question_dict['assessment_id']
            question_row[EXERCISE_QUESTIONS_TYPE_KEY] = type_lookup[question_dict['type']]
            question_row[EXERCISE_QUESTIONS_QUESTION_KEY] = question_dict['question']
            question_row[EXERCISE_QUESTIONS_OPTION_A_KEY] = _safe_list_get(options, 0, None)
            question_row[EXERCISE_QUESTIONS_OPTION_B_KEY] = _safe_list_get(options, 1, None)
            question_row[EXERCISE_QUESTIONS_OPTION_C_KEY] = _safe_list_get(options, 2, None)
            question_row[EXERCISE_QUESTIONS_OPTION_D_KEY] = _safe_list_get(options, 3, None)
            question_row[EXERCISE_QUESTIONS_OPTION_E_KEY] = _safe_list_get(options, 4, None)
            question_row[EXERCISE_QUESTIONS_OPTION_FGHI_KEY] = extra_options
            question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY] = _safe_list_get(correct, 0, None)
            question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY] = _safe_list_get(correct, 1, None)
            question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY] = _safe_list_get(correct, 2, None)
            question_row[EXERCISE_QUESTIONS_HINT_1_KEY] = None # TODO
            question_row[EXERCISE_QUESTIONS_HINT_2_KEY] = None # TODO
            question_row[EXERCISE_QUESTIONS_HINT_3_KEY] = None # TODO
            question_row[EXERCISE_QUESTIONS_HINT_4_KEY] = None # TODO
            question_row[EXERCISE_QUESTIONS_HINT_5_KEY] = None # TODO
            question_row[EXERCISE_QUESTIONS_HINT_6789_KEY] = None # TODO
            # WRITE QUESTION ROW
            csvwriter.writerow(question_row)
Example no. 20
    def construct_channel(self, *args, **kwargs):
        channel = self.get_channel(
            *args,
            **kwargs)  # Create ChannelNode from data in self.channel_info

        lang_names = list(self.data.keys())
        lang_names.sort()

        for lang_name in lang_names:
            lang_data = self.data[lang_name]
            LOGGER.info("Creating app for language: {}".format(lang_name))
            lang = languages.getlang_by_native_name(lang_name)

            zip_dir = self.client.create_zip_dir_for_page(lang_data['url'])

            soup = self.client.get_page_soup(lang_data['url'])

            # Remove the translation list if found
            translations = soup.find('div', {'id': 'translations'})
            if translations:
                translations.extract()

            # Grab the localized title
            title = soup.find('span', {'id': 'share_title'}).text

            # Save the modified index.html page
            thumbnail = None
            for resource in lang_data['resources']:
                if 'dp3t.png' in resource:
                    thumbnail = os.path.join(zip_dir, resource)
                    break

            with open(os.path.join(zip_dir, 'index.html'), 'wb') as f:
                f.write(soup.prettify(encoding='utf-8'))

            # create_predictable_zip ensures that the ZIP file does not change each time it's created. This
            # ensures that the zip doesn't get re-uploaded just because zip metadata changed.
            zip_file = zip.create_predictable_zip(zip_dir)
            zip_name = lang.primary_code if lang else lang_name
            zip_filename = os.path.join(self.ZIP_DIR,
                                        "{}.zip".format(zip_name))
            os.makedirs(os.path.dirname(zip_filename), exist_ok=True)
            os.rename(zip_file, zip_filename)

            topic = nodes.TopicNode(source_id=lang_name, title=lang_name)
            zip_node = nodes.HTML5AppNode(
                source_id="covid19-sim-{}".format(lang_name),
                title=title,
                files=[files.HTMLZipFile(zip_filename)],
                license=licenses.PublicDomainLicense(
                    "Marcel Salathé & Nicky Case"),
                language=lang,
                thumbnail=thumbnail)
            topic.add_child(zip_node)
            channel.add_child(topic)

        return channel
def scrape_subcategory(link, topic):
    url = "{}{}".format(BASE_URL, link.lstrip("/"))
    resource_page = BeautifulSoup(downloader.read(url), 'html5lib')

    # Skip "All" category
    for resource_filter in resource_page.find('div', {'class': 'menu-filtro'}).find_all('a')[1:]:
        LOGGER.info('    {}'.format(resource_filter.text))
        source_id = get_source_id('{}/{}'.format(topic.title, resource_filter.text))
        filter_topic = nodes.TopicNode(title=resource_filter.text, source_id=source_id)
        scrape_resource_list(url + resource_filter['href'], filter_topic)
        topic.add_child(filter_topic)
Example no. 22
 def get_nodes_by_ids_complete(self, studio_id):
     """
     Get the complete JSON representation of a content node from the Studio API.
     """
     NODES_ENDPOINT = self.studio_url + '/api/get_nodes_by_ids_complete/'
     headers = {"Authorization": "Token {0}".format(self.token)}
     url = NODES_ENDPOINT + studio_id
     LOGGER.info('  GET ' + url)
     response = requests.get(url, headers=headers)
     studio_node = response.json()[0]
     return studio_node
Example no. 23
 def _recusive_visit_rm_global_nav_children(subtree):
     newchildren = []
     for child in subtree['children']:
         child_url = child['url']
         if len(child['children']) == 0 and child_url in global_nav_urls:
             LOGGER.info('Removing global nav url =' + child_url)
         else:
             clean_child = _recusive_visit_rm_global_nav_children(child)
             newchildren.append(clean_child)
     subtree['children'] = newchildren
     return subtree
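A usage sketch (the url values are hypothetical): strip global-navigation-only leaf nodes from a crawled web resource tree before further processing.

 global_nav_urls = ['http://www.prathamopenschool.org/', 'http://www.prathamopenschool.org/about']  # hypothetical examples
 web_resource_tree = _recusive_visit_rm_global_nav_children(web_resource_tree)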
    def pre_run(self, args, options):
        """
        This is where all the work happens for this chef:
        - Load the source tree from the Khan Academy API
        - Convert the tree of Khan objects into ricecooker json dict objects
        - Write ricecooker json tree to the appropriate file
        """
        lang, variant = self.parse_lang_and_variant_from_kwargs(options)

        if lang == "en" and variant != "in-in":
            # Load the CCSSM tags for the KA en channel (but not in-in variant)
            global CC_MAPPING
            CC_MAPPING = generate_common_core_mapping()

        channel_node = self.get_channel_dict(options)
        channel_node["children"] = []

        # Handle special case of building Kolibri channel from youtube playlists
        if options.get("youtube_channel_id"):
            youtube_channel_id = options.get("youtube_channel_id")
            LOGGER.info("Found YouTube channel {}".format(youtube_channel_id))
            root_node = youtube_playlist_scraper(youtube_channel_id,
                                                 channel_node)
            json_tree_path = self.get_json_tree_path(**options)
            LOGGER.info("Writing youtube ricecooker tree to " + json_tree_path)
            write_tree_to_json_tree(json_tree_path, root_node)
            return None

        LOGGER.info("Downloading KA topic tree")
        # Obtain the complete topic tree for lang=lang from the KA API
        ka_root_topic, topics_by_slug = get_khan_topic_tree(lang=lang)
        # TODO: discuss w @kollivier introducing "archive" step here (for source diffs)
        self.topics_by_slug = topics_by_slug  # to be used for topic replacements
        self.slug_blacklist = get_slug_blacklist(lang=lang, variant=variant)
        self.topic_replacements = get_topic_tree_replacements(lang=lang,
                                                              variant=variant)

        if options.get("english_subtitles"):
            # we will include english videos with target language subtitles
            duplicate_videos(ka_root_topic)

        LOGGER.info("Converting KA nodes to ricecooker json nodes")
        root_topic = self.convert_ka_node_to_ricecooker_node(ka_root_topic,
                                                             target_lang=lang)
        for topic in root_topic["children"]:
            channel_node["children"].append(topic)

        # write the ricecooker tree to a json file
        json_tree_path = self.get_json_tree_path(**options)
        LOGGER.info("Writing ricecooker json tree to " + json_tree_path)
        write_tree_to_json_tree(json_tree_path, channel_node)
def download_structure_csv(which=None):
    if which == 'English':
        response = requests.get(PRADIGI_ENGLISH_SHEET_CSV_URL)
        csv_data = response.content.decode('utf-8')
        with open(PRADIGI_ENGLISH_SHEET_CSV_PATH, 'w') as csvfile:
            csvfile.write(csv_data)
            LOGGER.info('Successfully saved ' + PRADIGI_ENGLISH_SHEET_CSV_PATH)
        return PRADIGI_ENGLISH_SHEET_CSV_PATH
    else:
        response = requests.get(PRADIGI_SHEET_CSV_URL)
        csv_data = response.content.decode('utf-8')
        with open(PRADIGI_SHEET_CSV_PATH, 'w') as csvfile:
            csvfile.write(csv_data)
            LOGGER.info('Successfully saved ' + PRADIGI_SHEET_CSV_PATH)
        return PRADIGI_SHEET_CSV_PATH
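A minimal usage sketch: the saved sheet can be loaded with the standard csv module (no assumptions are made here about the sheet's actual column names).

import csv

with open(download_structure_csv(), newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        print(row)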
Example no. 26
    def scrape_video_page(self, url, title):
        """ Creates a video topic with all the videos on the page """
        IGNORED_VIDEOS = ['google', 'facebook']
        VIDEO_SCRAPERS = [who.WHOWebVideoScraper, who.WHOVideoScraper]

        video_topic = nodes.TopicNode(source_id=url, title=title)
        contents = BeautifulSoup(downloader.read(url), 'html.parser')

        # Scrape youtube embeds
        # e.g. https://www.who.int/emergencies/diseases/novel-coronavirus-2019/advice-for-public/videos
        for iframe in contents.findAll('iframe'):
            if not any(test in iframe['src'] for test in IGNORED_VIDEOS):
                header = iframe.find_parent('div', {
                    'class': 'sf_colsIn'
                }).find('div', {
                    'class': 'section-heading'
                }).text.strip()
                LOGGER.info('      - Downloading {}'.format(
                    header.encode('utf-8')))
                scraper = guess_scraper(iframe['src'], scrapers=VIDEO_SCRAPERS
                                        )  # Might be native or youtube video
                video_node = scraper.to_contentnode(header,
                                                    license=LICENSE,
                                                    directory="videos")
                video_topic.add_child(video_node)

        # Scrape native videos
        # e.g. https://www.who.int/zh/emergencies/diseases/novel-coronavirus-2019/advice-for-public/videos
        for video in contents.findAll('div',
                                      {'class': 'sf-multimedia-item__video'}):
            header = video.find('h3').text.strip()
            LOGGER.info('      - Downloading {}'.format(
                header.encode('utf-8')))
            video_matches = re.search(r"\(\s*\"(.+)\"\,\s*\"(.+)\"\)",
                                      video.find('a')['onclick'])

            # Embedded youtube videos here refer to playlists, so skip them
            if 'YoutubeVideo' == video_matches.group(1):
                continue

            scraper = who.WHOVideoScraper(video_matches.group(2))
            video_node = scraper.to_contentnode(header,
                                                license=LICENSE,
                                                directory="videos")
            video_topic.add_child(video_node)

        return video_topic
def scrape_video_collection(url, topic):
    """ Scrape videos under video collection and add to the topic node
        Args:
            url (str): url to video page (e.g. https://www.exploratorium.edu/video/inflatable-jimmy-kuehnle)
            topic (TopicNode): topic to add video nodes to
    """
    try:
        collection_contents = BeautifulSoup(read(url), 'html5lib')
        for result in collection_contents.find_all('div',
                                                   {'class': 'search-result'}):
            header = result.find('div',
                                 {'class': 'views-field-field-html-title'})
            LOGGER.info("            {}".format(header.text.strip()))

            # Get video from given url
            description = result.find('div', {'class': 'search-description'})
            video_contents = BeautifulSoup(read(header.find('a')['href']),
                                           'html.parser')
            for k, v in get_brightcove_mapping(video_contents).items():
                video_node = nodes.VideoNode(
                    source_id=k,
                    title=header.text.strip().replace("’", "'"),
                    description=description.text.strip()
                    if description else "",
                    license=LICENSE,
                    copyright_holder=COPYRIGHT_HOLDER,
                    author=v.get('author') or "",
                    files=[
                        files.WebVideoFile(v['url'], high_resolution=False)
                    ],
                    thumbnail=get_thumbnail_url(result.find('img')['src']),
                )

                # If video doesn't already exist here, add to topic
                if not next((c for c in topic.children
                             if c.source_id == video_node.source_id), None):
                    topic.add_child(video_node)

        # Scrape next page (if any)
        next_page_url = get_next_page_url(collection_contents)
        if next_page_url:
            scrape_video_collection(next_page_url, topic)

    except requests.exceptions.HTTPError:
        LOGGER.error("Could not read collection at {}".format(url))
def get_phet_zip_file(zip_file_url, main_file_and_query):
    """
    Phet simulations are provided in the zip file `phet.zip`, and the entry point
    is passed as a GET parameter in `main_file_and_query`. To make these compatible
    with Kolibri's default behaviour of loading index.html, we will:
      - Rename the simulation's main HTML file to phetindex.html
      - Add a custom index.html that uses a javascript redirect to phetindex.html?{sim_id}
    """
    u = urlparse(main_file_and_query)
    idk, sim_id = u.query.split('=')
    assert idk == 'id', 'unknown query string format: ' + main_file_and_query
    main_file = u.scheme + '://' + u.netloc + u.path  # skip querystring

    destpath = tempfile.mkdtemp()
    LOGGER.info('saving phet zip file in dir ' + destpath)
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # Rename main_file to phetindex.html
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'phetindex.html')
        os.rename(src, dest)

        # Create the custom index.html that redirects to phetindex.html
        index_html = PHET_INDEX_HTML_TEMPLATE.format(sim_id=sim_id)
        with open(os.path.join(zip_folder, 'index.html'), 'w') as indexf:
            indexf.write(index_html)

        # Always be zipping!
        return create_predictable_zip(zip_folder)

    except Exception as e:
        LOGGER.error("get_phet_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file_and_query, destpath, e))
        return None
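A rough sketch of what PHET_INDEX_HTML_TEMPLATE might contain (the actual template is not included in this listing): a page that immediately redirects to the renamed entry point, preserving the simulation id.

PHET_INDEX_HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
  <head>
    <script type="text/javascript">
      // Hypothetical redirect page: forward to the renamed main file with the sim id.
      window.location.href = 'phetindex.html?id={sim_id}';
    </script>
  </head>
  <body></body>
</html>
"""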
Example no. 29
    def pre_run(self, args, options):
        """
        Build the ricecooker json tree for the entire channel
        """
        LOGGER.info('in pre_run...')

        # delete .zip files in temporary dir when running using update
        if args['update']:
            LOGGER.info('Deleting all zips in cache dir {}'.format(
                HTML5APP_ZIPS_LOCAL_DIR))
            for rel_path in os.listdir(HTML5APP_ZIPS_LOCAL_DIR):
                abs_path = os.path.join(HTML5APP_ZIPS_LOCAL_DIR, rel_path)
                if os.path.isdir(abs_path):
                    shutil.rmtree(abs_path)

        # option to skip crawling stage
        if 'nocrawl' not in options:
            self.crawl(args, options)

        # Conditionally determine `source_id` depending on variant specified
        if 'variant' in options and options['variant'].upper() == 'LE':
            # Official PraDigi channel =
            channel_name = 'PraDigi'
            channel_source_id = PRADIGI_SOURCE_ID__VARIANT_LE
            DEBUG_MODE = False
        else:
            # Pratham ETL (used to import content from website into Pratham app)
            # channel_id = f9da12749d995fa197f8b4c0192e7b2c
            channel_name = 'PraDigi Pratham'
            channel_source_id = PRADIGI_SOURCE_ID__VARIANT_PRATHAM

        ricecooker_json_tree = dict(
            title=channel_name,
            source_domain=PRADIGI_DOMAIN,
            source_id=channel_source_id,
            description=PRADIGI_DESCRIPTION,
            thumbnail='chefdata/prathamlogo_b01-v1.jpg',
            language='mul',
            children=[],
        )
        for lang in PRADIGI_WEBSITE_LANGUAGES:
            lang_subtree = self.build_subtree_for_lang(lang)
            ricecooker_json_tree['children'].append(lang_subtree)
        json_tree_path = self.get_json_tree_path()
        write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
def get_subtopics(parent, path):
    doc = get_page(path)
    try:
        menu_row = doc.find('div', {'id': 'body-row'})
        menu_row = menu_row.find('div', {'class': 'col-md-2'})
    except Exception as e:
        LOGGER.error('get_subtopics: %s : %s' % (e, doc))
        return
    for subtopic in menu_row.find_all('a'):
        try:
            title = subtopic.get_text().strip()
            source_id = get_source_id(subtopic['href'])
            LOGGER.info('  subtopic: %s: %s' % (source_id, title))
            node = TopicNode(title=title, source_id=source_id)
            parent.add_child(node)
            get_lessons(node, subtopic['href'])
        except Exception as e:
            LOGGER.error('get_subtopics: %s : %s' % (e, subtopic))