Example #1
def add_questions(exercise_node, question_list):
    EXPECTED_QUESTION_TYPES = [
        INPUT_QUESTION, MULTIPLE_SELECTION, SINGLE_SELECTION, FREE_RESPONSE,
        PERSEUS_QUESTION
    ]

    for q in question_list:
        question_type = q.get('question_type')
        if question_type not in EXPECTED_QUESTION_TYPES:
            LOGGER.critical(question_type)
            raise NotImplementedError(
                'Unexpected question type found in channel json.')

        question_text = q.get('question')
        hints = q.get('hints')
        hints = hints if isinstance(hints, str) else [hint for hint in hints or []]

        if question_type == exercises.MULTIPLE_SELECTION:
            q_obj = questions.MultipleSelectQuestion(
                id=q['id'],
                question=question_text,
                correct_answers=[answer for answer in q['correct_answers']],
                all_answers=[answer for answer in q['all_answers']],
                hints=hints,
            )
            exercise_node.add_question(q_obj)

        elif question_type == exercises.SINGLE_SELECTION:
            q_obj = questions.SingleSelectQuestion(
                id=q['id'],
                question=question_text,
                correct_answer=q['correct_answer'],
                all_answers=[answer for answer in q['all_answers']],
                hints=hints,
            )
            exercise_node.add_question(q_obj)

        elif question_type == exercises.INPUT_QUESTION:
            q_obj = questions.InputQuestion(
                id=q['id'],
                question=question_text,
                answers=[answer for answer in q['answers']],
                hints=hints,
            )
            exercise_node.add_question(q_obj)

        elif question_type == exercises.PERSEUS_QUESTION:
            q_obj = questions.PerseusQuestion(
                id=q['id'],
                raw_data=q.get('item_data'),
                source_url="https://www.khanacademy.org/",
            )
            exercise_node.add_question(q_obj)

        else:
            raise UnknownQuestionTypeError(
                "Unrecognized question type '{0}': accepted types are {1}".
                format(question_type,
                       [key for key, value in exercises.question_choices]))
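
The dict keys that add_questions() expects are only implied by the lookups above; here is a minimal sketch of a `question_list`, with illustrative ids, texts, and answers, assuming the exercises constants from le_utils:

# Hypothetical question_list entries; keys are inferred from the lookups in
# add_questions() above and all values are placeholders.
from le_utils.constants import exercises

sample_question_list = [
    {
        'question_type': exercises.SINGLE_SELECTION,
        'id': 'sample-q1',
        'question': 'What is 2 + 2?',
        'correct_answer': '4',
        'all_answers': ['3', '4', '5'],
        'hints': ['It is one more than 3.'],
    },
    {
        'question_type': exercises.INPUT_QUESTION,
        'id': 'sample-q2',
        'question': 'Type the number four.',
        'answers': ['4'],
        'hints': [],
    },
]
# add_questions(exercise_node, sample_question_list)  # exercise_node: an exercise node from the channel tree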
Example #2
def get_html5_app_zip_path(slug):
    resp = session.get(READ_URL.format(slug))
    if resp.status_code == 200:
        resp = resp.json()
    else:
        LOGGER.info('The story {} is not available.\n'.format(slug))
        return None
    content = ""
    for page in (resp['data']['pages']):
        soup = BeautifulSoup(page['html'], 'html.parser')
        if page.get('coverImage', None):
            img_src = page['coverImage']['sizes'][-1]['url']
            soup.img['src'] = img_src
        content = content + "\n" + str(soup)

    context = {'content': content}

    handle, destination = tempfile.mkstemp(suffix=".zip")
    os.close(handle)
    htmlwriter = HTMLWriter(destination)
    with htmlwriter as f:
        index_html = TEMPLATE_ENVIRONMENT.get_template('indexfile').render(
            context)
        f.write_index_contents(index_html)

    LOGGER.info(destination)
    return destination
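
A hedged sketch of how the returned zip path could be attached to an HTML5 app node, mirroring the node and file API used in the other examples here; the slug, title, license constant, and parent topic are placeholders:

story_zip = get_html5_app_zip_path('sample-story-slug')   # slug is illustrative
if story_zip:
    story_node = nodes.HTML5AppNode(
        source_id='story-sample-story-slug',
        title='Sample Story',
        license=LICENSE,                         # assumed channel-level license constant
        files=[files.HTMLZipFile(path=story_zip)],
    )
    topic.add_child(story_node)                  # topic: an existing TopicNode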
Example #3
    def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
        youtube_info = None
        # 1. Try to get from cache if allowed:
        if use_cache and os.path.exists(self.cache_path):
            LOGGER.info("==> [%s] Retrieving cached information...", self.__str__())
            with open(self.cache_path) as jsonf:
                youtube_info = json.load(jsonf)
        # 2. Fetch info from youtube_dl
        if not youtube_info:
            LOGGER.info("==> [%s] Requesting info from youtube...", self.__str__())
            os.makedirs(self.cache_dir, exist_ok=True)
            try:
                youtube_resource = YouTubeResource(self.url, useproxy=use_proxy)
            except youtube_dl.utils.ExtractorError as e:
                if "unavailable" in str(e):
                    LOGGER.error("==> [%s] Resource unavailable for URL: %s", self.__str__(), self.url)
                    return None
                raise  # re-raise unexpected extractor errors instead of leaving youtube_resource unbound

            if youtube_resource:
                try:
                    # Save YouTube info to JSON cache file
                    youtube_info = youtube_resource.get_resource_info(options)
                    if youtube_info:
                        with open(self.cache_path, 'w') as jsonf:
                            json.dump(youtube_info, jsonf,
                                      indent=4, ensure_ascii=False, sort_keys=True)
                    else:
                        LOGGER.error("==> [%s] Failed to extract YouTube info", self.__str__())
                except Exception as e:
                    LOGGER.error("==> [%s] Failed to get YouTube info: %s", self.__str__(), e)
                    return None
        return youtube_info
Example #4
    def process(self):
        if 'fonts' in self.link:  # Omit google fonts
            self.tag.decompose()
            return

        # Parse urls in css (using parseString because it is much faster than parseUrl)
        style_sheet = downloader.read(self.link).decode('utf-8-sig',
                                                        errors='ignore')
        sheet = cssutils.parseString(style_sheet)
        for css_url in cssutils.getUrls(sheet):
            if not css_url.startswith('data:image') and not css_url.startswith(
                    'data:application'):
                try:
                    style_sheet = style_sheet.replace(
                        css_url,
                        os.path.basename(
                            self.write_url(css_url,
                                           url=self.link,
                                           default_ext='.png')))
                except BROKEN_EXCEPTIONS as e:
                    LOGGER.warning(
                        'Unable to download stylesheet url at {} ({})'.format(
                            self.url, str(e)))

        self.tag[self.attribute] = self.format_url(
            self.write_contents(self.get_filename(self.link), style_sheet))
        return self.tag[self.attribute]
Example #5
 def is_media_file(self, url):
     """
     Makes a HEAD request for `url` and returns (verdict, head_response),
     where verdict is True if `url` points to a media file (.pdf, .docx, etc.)
     """
     head_response = self.make_request(url, method='HEAD')
     if head_response:
         content_type = head_response.headers.get('content-type', None)
         if not content_type:
             LOGGER.warning(
                 'HEAD response does not have `content-type` header. url = '
                 + url)
             return (False, None)
         if content_type in self.MEDIA_CONTENT_TYPES:
             return (True, head_response)
         else:
             return (False, head_response)
     else:
         LOGGER.warning('HEAD request failed for url ' + url)
         # Fallback strategy: try to guess if media link based on extension
         for media_ext in self.MEDIA_FILE_FORMATS:
             if url.endswith('.' + media_ext):
                 return (True, None)
         # if all else fails, assume False
         return (False, None)
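
An illustrative way to consume the (verdict, head_response) tuple; the crawler instance and URL are placeholders:

verdict, head_response = crawler.is_media_file('https://example.org/files/report.pdf')
if verdict:
    content_type = head_response.headers.get('content-type') if head_response else None
    LOGGER.info('Media file detected; content-type = %s', content_type)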
Example #6
def scrape_video_menu(url):
    """ Scrape videos from url
        Args:
            url (str): url to scrape from (e.g. https://www.exploratorium.edu/video/subjects)
        Returns TopicNode containing all videos
    """
    LOGGER.info("SCRAPING VIDEOS...")
    video_topic = nodes.TopicNode(title="Videos",
                                  source_id="main-topic-videos")
    contents = BeautifulSoup(read(url), 'html5lib')

    for subject in contents.find_all('div', {'class': 'subject'}):
        title = subject.find('div', {
            'class': 'name'
        }).text.strip().replace("’", "'")
        LOGGER.info("    {}".format(title))
        topic = nodes.TopicNode(
            title=title,
            source_id="videos-{}".format(title),
            thumbnail=get_thumbnail_url(subject.find('img')['src']),
        )
        video_topic.add_child(topic)
        scrape_video_subject(subject.find('a')['href'], topic)

    return video_topic
Example #7
def get_subtitle_languages(youtube_id):
    """
    Returns a list of the subtitle language codes available for a given video.
    We try to get the list using two approaches:
    1. The YouTube API (works for public videos when YOUTUBE_API_KEY is defined)
    2. A slower fallback using YouTubeResource, which in turn calls youtube_dl
    """
    # Check if we already have the lang_codes list for this youtube_id cached...
    cache_filename = '{}__lang_codes.json'.format(youtube_id)
    cache_filepath = os.path.join(SUBTITLE_LANGUAGES_CACHE_DIR, cache_filename)
    if os.path.exists(cache_filepath):  # Cache hit!
        with open(cache_filepath) as jsonf:
            cache_data = json.load(jsonf)
            return cache_data['lang_codes']

    if YOUTUBE_API_KEY:
        try:
            lang_codes = get_subtitles_using_api(youtube_id)
            return lang_codes
        except HttpError as e:
            LOGGER.info("Can't access API for video {} ...".format(youtube_id))
    lang_codes = get_subtitles_using_youtube_dl(youtube_id)

    # Cache the results in chefdata/sublangscache/{youtube_id}__lang_codes.json
    cache_data = {"lang_codes": lang_codes}
    with open(cache_filepath, 'w') as jsonf:
        json.dump(cache_data, jsonf, ensure_ascii=True)

    return lang_codes
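
One possible use of the returned codes is attaching subtitle files to a video node; this sketch assumes ricecooker's YouTubeSubtitleFile and skips any filtering of unsupported language codes:

youtube_id = '9bZkp7q19f0'                       # illustrative video id
lang_codes = get_subtitle_languages(youtube_id)
for lang_code in lang_codes:
    video_node.add_file(                         # video_node: an existing VideoNode
        files.YouTubeSubtitleFile(youtube_id=youtube_id, language=lang_code))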
Example #8
 def recursive_extract_website_games(subtree):
     """
     Processes all child nodes of the subtree then calls itself on any folder-like
     child nodes. Weird, I know, but it works so I'm not touching it.
     """
     if 'children' in subtree:
         # do processing
         new_children = []
         for child in subtree['children']:
             child_url = child['url']
             if child['kind'] == 'PrathamZipResource':
                 if is_website_game(child_url):
                     # extract all game names referenced in manual curation Excel file to process separately...
                     child_url = child_url.replace(
                         'https://www.prathamopenschool.org/CourseContent/Games/',
                         '')
                     child_url = child_url.replace(
                         'http://www.prathamopenschool.org/CourseContent/Games/',
                         '')
                     child['title_en'] = child_url.replace('.zip', '')
                     print('EXTRACTED game name', child['title_en'],
                           'from url', child['url'])
                     website_games.append(child)
                 else:
                     # leave other games where they are
                     LOGGER.info('Undocumented game-like web resource ' +
                                 child['url'])
                     new_children.append(child)
             else:
                 # leave other content as is
                 new_children.append(child)
         #
         # recurse
         for child in subtree['children']:
             recursive_extract_website_games(child)
Example #9
    def pre_run(self, args, options):
        """
        Build the ricecooker json tree for the channel.
        The code here is similar to the code in `ricecooker_channel/chef.py`, but
        the channel hierarchy is built using dictionary objects instead of classes.
        """
        LOGGER.info('In pre_run...')

        # 1. Create the channel tree
        ricecooker_json_tree = dict(
            title='Sample JSON channel',
            source_domain='source.org',
            source_id='sample-json-channel',
            description='This channel was created from the files in the content/ ' \
                + 'directory and the metadata in sample_ricecooker_json_tree.json',
            thumbnail='./content/sample-json-channel-files/channel_thumbnail.jpg',
            language='en',
            children=[],
        )
        # The root object of the ricecooker json tree contains the channel info;
        # add topic and content nodes to the children list to build the tree.

        # 2. Add topic nodes and content nodes to the tree
        self.create_content_nodes(ricecooker_json_tree)
        self.create_exercise_nodes(ricecooker_json_tree)

        # 3. Save the tree to chefdata/trees/sample_ricecooker_json_tree.json
        json_tree_path = self.get_json_tree_path()
        write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
        LOGGER.info('Finished writing ricecooker json tree.')
Example #10
def website_game_webresouce_to_ricecooker_node(lang, web_resource):
    """
    Create Ricecooker Json structure for game from web resource dict `web_resource`.
    """
    game_node = dict(
        kind=content_kinds.HTML5,
        source_id=web_resource['source_id'],
        language=lang,
        title=web_resource['title'],
        description='source_url=' + web_resource['url'] if DEBUG_MODE else '',
        license=PRADIGI_LICENSE,
        thumbnail=web_resource.get('thumbnail_url'),
        files=[],
    )
    zip_tmp_path = get_zip_file(web_resource['url'], web_resource['main_file'])
    if zip_tmp_path:
        zip_file = dict(
            file_type=file_types.HTML5,
            path=zip_tmp_path,
            language=lang,
        )
        game_node['files'].append(zip_file)
        LOGGER.debug('Created HTML5AppNode for game ' + web_resource['title'])
        return game_node
    else:
        LOGGER.error('Failed to create zip for game at url=' +
                     web_resource['url'])
        return None
Example #11
def make_request(url, *args, **kwargs):
    response = sess.get(url, *args, **kwargs)
    if response.status_code != 200:
        LOGGER.debug("NOT FOUND:" + url)
    elif not response.from_cache:
        LOGGER.debug("NOT CACHED:" + url)
    return response
Example #12
    def on_page(self, url, page, context):
        """
        Basic handler that appends current page to parent's children list and
        adds all links on current page to the crawling queue.
        """
        LOGGER.debug('in on_page ' + url)
        page_dict = dict(
            kind='PageWebResource',
            url=url,
            children=[],
        )
        page_dict.update(context)

        # attach this page as another child in parent page
        context['parent']['children'].append(page_dict)

        links = page.find_all('a')
        for i, link in enumerate(links):
            if link.has_attr('href'):
                link_url = urljoin(url, link['href'])
                if self.should_ignore_url(link_url):
                    pass
                    # Uncomment three lines below for debugging to record ignored links
                    # ignored_rsrc_dict = self.create_ignored_url_dict(link_url)
                    # ignored_rsrc_dict['parent'] = page_dict
                    # page_dict['children'].append(ignored_rsrc_dict)
                else:
                    self.enqueue_url_and_context(link_url,
                                                 {'parent': page_dict})
            else:
                pass
Example #13
def load_pradigi_structure(which=None):
    csv_path = download_structure_csv(which=which)
    struct_list = []
    with open(csv_path, 'r') as csvfile:
        reader = csv.DictReader(csvfile,
                                fieldnames=PRADIGI_SHEET_CSV_FILEDNAMES)
        next(reader)  # Skip Headers row
        next(reader)  # Skip info line
        for row in reader:
            clean_row = _clean_dict(row)
            if clean_row[SUBJECT_KEY] is None:
                continue  # skip blank lines (identified by missing subject col)
            if clean_row[AGE_GROUP_KEY] in PRADIGI_AGE_GROUPS and clean_row[
                    SUBJECT_KEY] in PRADIGI_SUBJECTS:
                resource_type = clean_row[RESOURCE_TYPE_KEY]
                if resource_type == 'Game' and clean_row[GAMENAME_KEY]:
                    # make sure Game Name is present when specifying a game
                    struct_list.append(clean_row)
                elif resource_type == 'Website Resources':
                    struct_list.append(clean_row)
                else:
                    LOGGER.warning('Problem with structure row {}'.format(
                        str(clean_row)))
            else:
                LOGGER.warning('Unrecognized structure row {}'.format(
                    str(clean_row)))
    return struct_list
Example #14
def add_subpages_from_wikipedia_list(topic, list_url):
    """ add_subpages_from_wikipedia_list: Parses wiki pages and creates corresponding files
        To understand how the following parsing works, look at:
            1. the source of the page (e.g. https://en.wikipedia.org/wiki/List_of_citrus_fruits), or inspect in chrome dev tools
            2. the documentation for BeautifulSoup version 4: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
    """
    page = read_source(list_url)        # Parse the page into BeautifulSoup format, so we can loop through and manipulate it
    table = page.find("table")          # Extract the main table from the page

    # Loop through all the rows in the table
    for row in table.find_all("tr"):
        columns = row.find_all("td")    # Extract the columns (cells, really) within the current row
        if not columns:                 # Some rows are empty, so just skip
            continue

        link = columns[0].find("a")     # Get the link to the subpage
        if not link:                    # Some rows don't have links, so skip
            continue

        # Extract the URL and title for the subpage
        url = make_fully_qualified_url(link["href"])
        title = link.text
        LOGGER.info("      Writing {}...".format(title))

        # Attempt to extract a thumbnail for the subpage, from the second column in the table
        image = columns[1].find("img")
        thumbnail_url = make_fully_qualified_url(image["src"]) if image else None
        if thumbnail_url and not (thumbnail_url.endswith("jpg") or thumbnail_url.endswith("png")):
            thumbnail_url = None

        # Download the wikipedia page into an HTML5 app node
        html5app = download_wikipedia_page(url, thumbnail=thumbnail_url, title=title)

        # Add the downloaded HTML5 app node into the topic
        topic.add_child(html5app)
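
A minimal illustrative call, using the citrus list URL mentioned in the docstring and a placeholder topic node:

citrus_topic = nodes.TopicNode(title="Citrus!", source_id="List_of_citrus_fruits")
add_subpages_from_wikipedia_list(
    citrus_topic, "https://en.wikipedia.org/wiki/List_of_citrus_fruits")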
Example #15
def apply_corrections_by_node_id(api, channel_tree, channel_id, corrections_by_node_id):
    """
    Given a dict `corrections_by_node_id` of the form,
    {
        'nodes_modified': {
            '<node_id (str)>': { modification dict1 },
            '<node_id (str)>': { modification dict2 },
        },
        'nodes_added': {
            '<node_id (str)>': { 'new_parent': (str),  'attributes': {...}},
        },
        'nodes_deleted': {
            '<node_id (str)>': {'old_parent': (str), 'attributes': {...}},
        },
        'nodes_moved': {
            '<node_id (str)>': {'old_parent': (str), 'new_parent': (str), 'attributes': {...}},
        },
    }
    this function will make the appropriate Studio API calls to apply the patch.
    """
    LOGGER.debug('Applying corrections...')
    #
    # Modifications
    for node_id, modifications_dict in corrections_by_node_id['nodes_modified'].items():
        apply_modifications_for_node_id(api, channel_tree, node_id, modifications_dict)
    #
    # Deletions
    for node_id, deletion_dict in corrections_by_node_id['nodes_deleted'].items():
        apply_deletion_for_node_id(api, channel_tree, channel_id, node_id, deletion_dict)
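
A hypothetical corrections patch in the documented format; the node ids are placeholders and each per-node dict uses only the keys shown in the docstring above:

corrections_by_node_id = {
    'nodes_modified': {
        'aabbccddee00112233445566778899aa': {},   # modification dict (contents handled by apply_modifications_for_node_id)
    },
    'nodes_added': {},
    'nodes_deleted': {
        '11223344556677889900aabbccddeeff': {'old_parent': 'ffeeddccbbaa99887766554433221100', 'attributes': {}},
    },
    'nodes_moved': {},
}
# apply_corrections_by_node_id(api, channel_tree, channel_id, corrections_by_node_id)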
Example #16
    def construct_channel(self, *args, **kwargs):
        """ construct_channel: Creates ChannelNode and build topic tree

            Solar Spell is organized with the following hierarchy (sample):
                Creative Arts (source_id = dir-creative-arts)
                |--- Culinary Arts (source_id = dir-culinary-arts)
                |--- |--- Real Pasifik 2 introducing Chef Alexis Tahiapuhe of Tahiti (source_id = file-real pasifik 2 introducing chef lela bolobolo of fiji.mp4)
                |--- Pacific Islands Arts and Culture(source_id = dir_pacific_islands_arts_and_culture)
                |--- |--- Cook Islands National Cultural Policy 10 July 2017_final english (File)
                |--- Teaching Resources and Classroom Activities
                Environment (source_id = dir-environment)
                |--- Adapting to Climate Change
                |--- |--- Action Against Climate Change Tuvalu Water and climate change
                |--- Climate Change Info                
                |--- |--- Animated Pacific Island Climate Change Videos
                ...
            Returns: ChannelNode
        """
        LOGGER.info("Constructing channel from {}...".format(BASE_URL))
        channel = self.get_channel(
            *args,
            **kwargs)  # Creates ChannelNode from data in self.channel_info
        LOGGER.info('   Writing {} Folder...'.format(CHANNEL_NAME))
        endpoint = BASE_URL + "content/"
        scrape_content(endpoint, channel)
        raise_for_invalid_channel(
            channel)  # Check for errors in channel construction
        return channel
Example #17
def keep_folder(raw_path):
    keep = True
    for pattern in DIR_EXCLUDE_PATTERNS:
        if pattern in raw_path:
            LOGGER.debug('rejecting %s', raw_path)
            keep = False
    return keep
Example #18
def scrape_snack_subject(slug, topic):
    """ Scrape snack subject page
        Args:
            slug (str): url slug to scrape from (e.g. /subject/arts)
            topic (TopicNode): topic to add html nodes to
    """
    contents = BeautifulSoup(read(slug), 'html5lib')

    for activity in contents.find_all('div', {'class': 'activity'}):
        LOGGER.info("        {}".format(activity.find('h5').text.strip()))
        # Scrape snack pages into zips
        write_to_path, tags = scrape_snack_page(activity.find('a')['href'])
        if not write_to_path:
            continue

        # Create html node
        description = activity.find('div', {'class': 'pod-description'})
        topic.add_child(
            nodes.HTML5AppNode(
                source_id=activity.find('a')['href'],
                title=activity.find('h5').text.strip().replace("’", "'"),
                description=description.text.strip() if description else "",
                license=LICENSE,
                copyright_holder=COPYRIGHT_HOLDER,
                files=[files.HTMLZipFile(path=write_to_path)],
                thumbnail=get_thumbnail_url(activity.find('img')['src']),
                tags=tags,
            ))

    # Scrape next page (if any)
    next_page_url = get_next_page_url(contents)
    if next_page_url:
        scrape_snack_subject(next_page_url, topic)
Example #19
 def get_nodes_by_ids_complete(self, studio_id):
     headers = {"Authorization": "Token {0}".format(self.token)}
     url = NODES_ENDPOINT + studio_id
     LOGGER.info('  GET ' + url)
     response = requests.get(url, headers=headers)
     studio_node = response.json()[0]
     return studio_node
Example #20
    def construct_channel(self, *args, **kwargs):
        """ construct_channel: Creates ChannelNode and build topic tree

            Wikipedia is organized with the following hierarchy:
                Citrus (Folder)
                |   Citrus Page HTML Zip (File)
                Potatoes (Folder)
                |   Potatoes Page HTML Zip (File)

            Returns: ChannelNode
        """
        LOGGER.info("Constructing channel from {}...".format(BASE_URL))

        channel = self.get_channel(
            *args,
            **kwargs)  # Creates ChannelNode from data in self.channel_info

        create_topic(channel, "Citrus!",
                     "List_of_citrus_fruits")  # Add Citrus folder
        create_topic(channel, "Potatoes!",
                     "List_of_potato_cultivars")  # Add Potatoes folder

        raise_for_invalid_channel(
            channel)  # Check for errors in channel construction

        return channel
Example #21
    def write_question_row_from_question_dict(self, source_id, question_dict):
        file_path = get_metadata_file_path(self.channeldir, self.questionsinfo)
        with open(file_path, 'a') as csv_file:
            csvwriter = csv.DictWriter(csv_file, EXERCISE_QUESTIONS_INFO_HEADER)

            def _safe_list_get(l, idx, default):
                try:
                    return l[idx]
                except IndexError:
                    return default

            # change image links to local
            question_dict = self._make_local_question_images(question_dict)

            type_lookup = {
                'single_selection': exercises.SINGLE_SELECTION,
                'true_false': exercises.SINGLE_SELECTION,
                'multiple_selection': exercises.MULTIPLE_SELECTION,
                'input_question': exercises.INPUT_QUESTION,
            }

            # ANSWERS
            answers = json.loads(question_dict['answers'])
            options = []  # all options
            correct = []  # correct answers
            for ans in answers:
                options.append(ans['answer'])
                if ans['correct']:
                    correct.append(ans['answer'])
            extra_options = DEFAULT_EXTRA_ITEMS_SEPARATOR.join(options[5:])

            # HINTS
            hints_raw = json.loads(question_dict['hints'])
            if hints_raw:
                raise ValueError('Found hints, but hints are not handled yet.')

            LOGGER.info('     - writing question with studio_id=' + question_dict['assessment_id'])
            question_row = {}
            question_row[EXERCISE_SOURCEID_KEY] = source_id
            question_row[EXERCISE_QUESTIONS_QUESTIONID_KEY] = question_dict['assessment_id']
            question_row[EXERCISE_QUESTIONS_TYPE_KEY] = type_lookup[question_dict['type']]
            question_row[EXERCISE_QUESTIONS_QUESTION_KEY] = question_dict['question']
            question_row[EXERCISE_QUESTIONS_OPTION_A_KEY] = _safe_list_get(options, 0, None)
            question_row[EXERCISE_QUESTIONS_OPTION_B_KEY] = _safe_list_get(options, 1, None)
            question_row[EXERCISE_QUESTIONS_OPTION_C_KEY] = _safe_list_get(options, 2, None)
            question_row[EXERCISE_QUESTIONS_OPTION_D_KEY] = _safe_list_get(options, 3, None)
            question_row[EXERCISE_QUESTIONS_OPTION_E_KEY] = _safe_list_get(options, 4, None)
            question_row[EXERCISE_QUESTIONS_OPTION_FGHI_KEY] = extra_options
            question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY] = _safe_list_get(correct, 0, None)
            question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY] = _safe_list_get(correct, 1, None)
            question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY] = _safe_list_get(correct, 2, None)
            question_row[EXERCISE_QUESTIONS_HINT_1_KEY] = None # TODO
            question_row[EXERCISE_QUESTIONS_HINT_2_KEY] = None # TODO
            question_row[EXERCISE_QUESTIONS_HINT_3_KEY] = None # TODO
            question_row[EXERCISE_QUESTIONS_HINT_4_KEY] = None # TODO
            question_row[EXERCISE_QUESTIONS_HINT_5_KEY] = None # TODO
            question_row[EXERCISE_QUESTIONS_HINT_6789_KEY] = None # TODO
            # WRITE QUESTION ROW
            csvwriter.writerow(question_row)
Example #22
def process_folder(channel, rel_path, filenames, metadata_provider):
    """
    Create `ContentNode`s from each file in this folder and add them to `channel`
    under the path `rel_path`.
    """
    LOGGER.debug('IN process_folder ' + str(rel_path) + '     ' + str(filenames))
    if not keep_folder(rel_path):
        return

    chan_path = chan_path_from_rel_path(rel_path, metadata_provider.channeldir)
    chan_path_tuple = path_to_tuple(chan_path)
    chan_path_list = list(chan_path_tuple)
    LOGGER.debug('chan_path_list=' + str(chan_path_list))

    # FIND THE CONTAINING NODE (channel or topic)
    if len(chan_path_list) == 1:
        # CASE CHANNEL ROOT: `rel_path` points to `channeldir`
        # No need to create a topic node here since channel already exists
        containing_node = channel  # attach content nodes in filenames directly to channel

    else:
        # CASE TOPIC FOLDER: `rel_path` points to a channelroot subfolder (a.k.a TopicNode)
        dirname = chan_path_list.pop()  # name of the folder (used as ID for internal lookup)
        topic_parent_node = get_topic_for_path(channel, chan_path_list)

        # read topic metadata to get title and description for the TopicNode
        topic_metadata = metadata_provider.get(chan_path_tuple)
        thumbnail_chan_path =  topic_metadata.get('thumbnail_chan_path', None)
        if thumbnail_chan_path:
            thumbnail_rel_path = rel_path_from_chan_path(thumbnail_chan_path, metadata_provider.channeldir)
        else:
            thumbnail_rel_path = None
        # create TopicNode for this folder
        topic = dict(
            kind=TOPIC_NODE,
            dirname=dirname,
            source_id='sourceid:' + rel_path,
            title=topic_metadata.get('title', dirname),
            description=topic_metadata.get('description', None),
            author=topic_metadata.get('author', None),
            language=topic_metadata.get('language', None),
            license=topic_metadata.get('license', None),
            thumbnail=thumbnail_rel_path,
            children=[],
        )
        topic_parent_node['children'].append(topic)
        containing_node = topic  # attach content nodes in filenames to the newly created topic

    # filter filenames
    filenames_cleaned = filter_filenames(filenames)
    filenames_cleaned2 = filter_thumbnail_files(chan_path, filenames_cleaned, metadata_provider)

    # PROCESS FILES
    for filename in filenames_cleaned2:
        chan_filepath = os.path.join(chan_path, filename)
        chan_filepath_tuple = path_to_tuple(chan_filepath)
        metadata = metadata_provider.get(chan_filepath_tuple)
        node = make_content_node(metadata_provider.channeldir, rel_path, filename, metadata)
        containing_node['children'].append(node)  # attach content node to containing_node
Example #23
 def to_tag(self, filename=None):
     try:
         img = self.create_tag('img')
         img['src'] = self.to_zip(filename=filename)
         return img
     except BROKEN_EXCEPTIONS as e:
         LOGGER.error(str(e))
         return self.create_broken_link_message(self.url)
Example #24
    def construct_channel(self, *args, **kwargs):
        channel = self.get_channel(
            *args,
            **kwargs)  # Create ChannelNode from data in self.channel_info

        lang_names = list(self.data.keys())
        lang_names.sort()

        for lang_name in lang_names:
            lang_data = self.data[lang_name]
            LOGGER.info("Creating app for language: {}".format(lang_name))
            lang = languages.getlang_by_native_name(lang_name)

            zip_dir = self.client.create_zip_dir_for_page(lang_data['url'])

            soup = self.client.get_page_soup(lang_data['url'])

            # Remove the translation list if found
            translations = soup.find('div', {'id': 'translations'})
            if translations:
                translations.extract()

            # Grab the localized title
            title = soup.find('span', {'id': 'share_title'}).text

            # Save the modified index.html page
            thumbnail = None
            for resource in lang_data['resources']:
                if 'dp3t.png' in resource:
                    thumbnail = os.path.join(zip_dir, resource)
                    break

            with open(os.path.join(zip_dir, 'index.html'), 'wb') as f:
                f.write(soup.prettify(encoding='utf-8'))

            # create_predictable_zip ensures that the ZIP file does not change each time it's created. This
            # ensures that the zip doesn't get re-uploaded just because zip metadata changed.
            zip_file = zip.create_predictable_zip(zip_dir)
            zip_name = lang.primary_code if lang else lang_name
            zip_filename = os.path.join(self.ZIP_DIR,
                                        "{}.zip".format(zip_name))
            os.makedirs(os.path.dirname(zip_filename), exist_ok=True)
            os.rename(zip_file, zip_filename)

            topic = nodes.TopicNode(source_id=lang_name, title=lang_name)
            zip_node = nodes.HTML5AppNode(
                source_id="covid19-sim-{}".format(lang_name),
                title=title,
                files=[files.HTMLZipFile(zip_filename)],
                license=licenses.PublicDomainLicense(
                    "Marcel Salathé & Nicky Case"),
                language=lang,
                thumbnail=thumbnail)
            topic.add_child(zip_node)
            channel.add_child(topic)

        return channel
Example #25
def download_resource(endpoint):
    try:
        url = '{}{}'.format(BASE_URL, endpoint.lstrip('/'))
        filename, ext = os.path.splitext(endpoint)
        filename = '{}.zip'.format(filename.lstrip('/').replace('/', '-'))
        write_to_path = CeibalPageScraper(url, locale='es').to_file(filename=filename, directory=DOWNLOAD_DIRECTORY)
        return write_to_path
    except Exception as e:
        LOGGER.error(str(e))
Example #26
def keep_folder(raw_path):
    """
    Keep only folders that don't contain patterns in `DIR_EXCLUDE_PATTERNS`.
    """
    keep = True
    for pattern in DIR_EXCLUDE_PATTERNS:
        if pattern in raw_path:
            LOGGER.debug('rejecting %s', raw_path)
            keep = False
    return keep
Example #27
 def to_tag(self, filename=None):
     try:
         embed = self.create_tag('embed')
         embed['src'] = self.to_zip(filename=filename)
         embed['width'] = '100%'
         embed['style'] = 'height: 500px;max-height: 100vh;'
         return embed
     except BROKEN_EXCEPTIONS as e:
         LOGGER.error(str(e))
         return self.create_broken_link_message(self.url)
Example #28
def make_fully_qualified_url(url):
    if url.startswith("//"):
        return "https:" + url
    if url.startswith("/"):
        return "https://en.wikipedia.org" + url
    if not url.startswith("http"):
        LOGGER.warning("Skipping bad URL (relative to unknown location): " +
                       url)
        return None
    return url
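
Expected behaviour, shown as illustrative calls:

make_fully_qualified_url("//upload.wikimedia.org/img.png")  # -> "https://upload.wikimedia.org/img.png"
make_fully_qualified_url("/wiki/Citrus")                    # -> "https://en.wikipedia.org/wiki/Citrus"
make_fully_qualified_url("ftp://example.org/file.pdf")      # -> None (logged as a bad URL)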
Example #29
    def _download_file(self, write_to_path):

        with html_writer.HTMLWriter(write_to_path) as zipper:
            try:
                self.zipper = zipper
                self.to_zip(filename='index.html')
            except Exception as e:
                # Errors here would otherwise surface only as "index.html file does not
                # exist", so log the underlying exception for more descriptive debugging
                LOGGER.error(str(e))
Example #30
def transform_video_vertical(vertical, parent_title=None):
    if 'children' not in vertical:
        return None, []

    # 1. LOOK FOR AN OPTIONAL html PREFIX TO USE AS DESCRIPTION
    description = ''
    # Extract an optional description from the first html node
    first_child = vertical['children'][0]
    if first_child['kind'] == 'html':
        description = extract_text_from_html_item(first_child,
                                                  translate_from='ar')

    if parent_title:
        video_title = parent_title + ' ' + vertical['display_name']
    else:
        video_title = vertical['display_name']

    # 2. GET THE VIDEO
    videos = [ch for ch in vertical['children'] if ch['kind'] == 'video']
    assert len(videos) == 1, 'multiple videos found'
    video = videos[0]
    video_dict = dict(kind=content_kinds.VIDEO,
                      source_id=video.get('youtube_id') or video.get('path'),
                      title=video_title,
                      author='Edraak',
                      description=description,
                      language=getlang('ar').code,
                      license=EDRAAK_LICENSE,
                      files=[])
    if 'youtube_id' in video:
        file_dict = dict(
            file_type=content_kinds.VIDEO,
            youtube_id=video['youtube_id'],
            language=getlang('ar').code,
            high_resolution=False,
        )
    elif 'path' in video:
        file_dict = dict(
            file_type=content_kinds.VIDEO,
            path=video['path'],
            language=getlang('ar').code,
            ffmpeg_settings={"crf": 24},
        )
    else:
        LOGGER.error('Video does not have youtube_id or path ' + str(video))
        return None, []  # avoid referencing file_dict when it was never assigned
    video_dict['files'].append(file_dict)

    # 3. LOOK FOR AN OPTIONAL RESOURCES html
    downloadable_resources = []
    htmls = [ch for ch in vertical['children'] if ch['kind'] == 'html']
    for html in htmls:
        if 'downloadable_resources' in html:
            downloadable_resources.extend(html['downloadable_resources'])

    return video_dict, downloadable_resources
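
A sketch of the `vertical` structure the function expects, with keys inferred from the lookups above; values are illustrative and the html child's remaining fields depend on extract_text_from_html_item:

sample_vertical = {
    'display_name': 'Lesson 1',
    'children': [
        {'kind': 'html'},                                # optional description prefix
        {'kind': 'video', 'youtube_id': 'dQw4w9WgXcQ'},  # exactly one video child expected
    ],
}
# video_dict, downloadable_resources = transform_video_vertical(sample_vertical, parent_title='Unit 1')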