Example 1
    def _is_likely_global_nav(self, url):
        """
        Returns True if `url` is likely a global navigation link, judged by how
        often it has been seen across the pages crawled so far.
        """
        seen_count = self.global_urls_seen_count[url]
        # assumption: the debug flag and the counters live on self, like the
        # other attributes this method already uses
        if self.debug:
            LOGGER.debug('seen_count/total_urls_seen_count=%f=%d/%d %s' % (
                float(seen_count) / self.total_urls_seen_count,
                seen_count,
                self.total_urls_seen_count,
                self.url_to_path(url)))
        # if previously determined to be a global nav link
        for global_nav_resource in self.global_nav_nodes['children']:
            if url == global_nav_resource['url']:
                return True
        # if a new link that is seen more often than the threshold fraction
        if float(seen_count) / self.total_urls_seen_count > self.GLOBAL_NAV_THRESHOLD:
            return True
        return False
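# A minimal, self-contained sketch of the frequency heuristic used above.
# The threshold value and the sample counts are illustrative assumptions,
# not values taken from the original crawler.
from collections import Counter

GLOBAL_NAV_THRESHOLD = 0.3                        # hypothetical cutoff fraction
global_urls_seen_count = Counter({'/about': 9,    # appeared on 9 of 10 pages
                                  '/lesson42': 1})
total_urls_seen_count = 10

def is_likely_global_nav_standalone(url):
    # same ratio test as the method above, minus the cached global-nav check
    return global_urls_seen_count[url] / total_urls_seen_count > GLOBAL_NAV_THRESHOLD

assert is_likely_global_nav_standalone('/about')
assert not is_likely_global_nav_standalone('/lesson42')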
def scrape_content_page(content_page_url, lang):
    """
    Download standalone HTML content pages (non-modules).
    Used for "Curriculum framework" and standalone pages in "Resources".
    Returns:
        page_info (dict):  info needed to construct the HTML5AppNode and HTMLZipFile
          - title
          - source_id
          - description
          - zip_path
    """
    LOGGER.debug('Scraping content page @ url = ' + str(content_page_url))
    doc = get_parsed_html_from_url(content_page_url)

    destination = tempfile.mkdtemp()
    print('destination=', destination)

    source_id = parse_qs(urlparse(content_page_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    content_title = raw_title.replace('OLCreate:', '')\
            .replace('TESSA_ARABIC', '')\
            .replace('TESSA_Eng', '')\
            .replace('TESSA_Fr', '')\
            .strip()

    page_info = dict(
        lang=lang,
        source_id=source_id,
        title=content_title,
        description=None,
        children=[],
    )

    # Do the actual download
    download_page(content_page_url, destination, 'index.html', lang)

    # zip it
    page_info['zip_path'] = create_predictable_zip(destination)

    # ship it
    return page_info
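# Illustrative call (the URL is a placeholder, not a real content page): the
# returned page_info dict carries everything needed downstream to build an
# HTML5AppNode with an HTMLZipFile pointing at zip_path.
if __name__ == '__main__':
    page_info = scrape_content_page(
        'https://www.open.edu/openlearncreate/mod/page/view.php?id=12345',
        lang='en',
    )
    print(page_info['title'], '->', page_info['zip_path'])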
    def on_special_subtopic_page(self, url, page, context):
        LOGGER.debug('     in on_special_subtopic_page ' + url)
        page_dict = dict(
            kind='special_subtopic_page',  # redundant -- mismatch with the original special_subtopic_page
            url=url,
            children=[],
        )
        page_dict.update(context)
        context['parent']['children'].append(page_dict)
        try:
            menu_row = page.find('div', {'id': 'body-row'})
            menu_row = menu_row.find('div', {'class': 'col-md-2'})
            print(str(menu_row))
        except Exception as e:
            LOGGER.error('on_special_subtopic_page: %s : %s' % (e, page))
            return
        for link in menu_row.find_all('a', {'class': 'list-group-item'}):
            try:
                title = link.get_text().strip()
                description = ''
                lesson_url = urljoin(url, link['href'])

                if self.should_ignore_url(lesson_url):
                    LOGGER.info('ignoring lesson ' + lesson_url)
                    continue

                source_id = get_source_id(link['href'])
                LOGGER.debug('         special lesson: %s: %s' % (source_id, title))
                context = dict(
                    parent=page_dict,
                    kind='fun_page',
                    title=title,
                    description=description,
                    source_id=source_id,
                    thumbnail_url=None,
                    children=[],
                )
                self.enqueue_url_and_context(lesson_url, context)
                # get_contents(node, link)
            except Exception as e:
                LOGGER.error('on_special_subtopic_page: %s : %s' % (e, link))
Example 4
    def generate_contentinfo_from_folder(self, csvwriter, rel_path, filenames):
        """
        Create a topic node row in Content.csv for the folder at `rel_path` and
        add content node rows for all the files in the `rel_path` folder.
        """
        LOGGER.debug('IN generate_contentinfo_from_folder ' + str(rel_path) + '     ' + str(filenames))
        from ricecooker.utils.linecook import filter_filenames, filter_thumbnail_files, chan_path_from_rel_path

        # WRITE TOPIC ROW
        topicrow = self.channeldir_node_to_row( rel_path.split(os.path.sep) )
        csvwriter.writerow(topicrow)

        # WRITE CONTENT NODE ROWS
        chan_path = chan_path_from_rel_path(rel_path, self.channeldir)
        filenames_cleaned = filter_filenames(filenames)
        # filenames_cleaned2 = filter_thumbnail_files(chan_path, filenames_cleaned, self)
        for filename in filenames_cleaned:
            path_tuple = rel_path.split(os.path.sep)
            path_tuple.append(filename)
            filerow = self.channeldir_node_to_row(path_tuple)
            csvwriter.writerow(filerow)
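# Sketch of how generate_contentinfo_from_folder is typically driven: walk the
# channel directory and emit one call per folder. The metadata_provider
# argument and the exact rel_path convention are assumptions for illustration.
import csv
import os

def write_content_csv(metadata_provider, csv_path='Content.csv'):
    with open(csv_path, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for abs_path, dirnames, filenames in os.walk(metadata_provider.channeldir):
            rel_path = os.path.relpath(
                abs_path, start=os.path.dirname(metadata_provider.channeldir))
            metadata_provider.generate_contentinfo_from_folder(
                csvwriter, rel_path, filenames)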
    def on_topic_page(self, url, page, context):
        LOGGER.debug('in on_topic_page ' + url)
        page_dict = dict(
            kind='topic_page',
            url=url,
            children=[],
        )
        page_dict.update(context)
        context['parent']['children'].append(page_dict)

        try:
            body_row = page.find('div', {'id': 'body-row'})
            menu_row = body_row.find('div', {'class': 'col-md-2'})
            subtopics = menu_row.find_all('a')
        except Exception as e:
            LOGGER.error('ERROR on_topic_page: %s : %s' % (e, url))
            return
        for subtopic in subtopics:
            try:
                subtopic_url = urljoin(url, subtopic['href'])

                if self.should_ignore_url(subtopic_url):
                    print('ignoring subtopic', subtopic_url)
                    continue

                title = get_text(subtopic)
                source_id = get_source_id(subtopic['href'])
                LOGGER.debug('  found subtopic: %s: %s' % (source_id, title))
                context = dict(
                    parent=page_dict,
                    kind='subtopic_page',
                    title=title,
                    source_id=source_id,
                    children=[],
                )
                self.enqueue_url_and_context(subtopic_url, context)
            except Exception as e:
                LOGGER.error('on_topic_page: %s : %s' % (e, subtopic))
Example 6
def download_zip_file(url):
    if not url:
        return (False, None)

    if get_suffix(url) != '.zip':
        return (False, None)

    response = sess.get(url)
    if response.status_code != 200:
        LOGGER.error("STATUS: {}, URL: {}", response.status_code, url)
        return (False, None)
    elif not response.from_cache:
        LOGGER.debug("NOT CACHED:", url)

    archive = zipfile.ZipFile(io.BytesIO(response.content))
    archive_members = list(
        filter(lambda f: f.filename.endswith('.pdf'), archive.infolist()))
    archive_member_names = [None] * len(archive_members)
    for i, pdf in enumerate(archive_members):
        path = os.path.join(PDFS_DATA_DIR, pdf.filename)
        archive_member_names[i] = path
        if not os.path.exists(path):
            archive.extract(pdf, PDFS_DATA_DIR)
    return (True, archive_member_names)
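# Illustrative call (placeholder URL): on success, the second element is the
# list of PDF paths extracted under PDFS_DATA_DIR.
ok, pdf_paths = download_zip_file('https://example.org/worksheets.zip')
if ok:
    for path in pdf_paths:
        print('extracted', path)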
    def on_lang_page(self, url, page, context):
        LOGGER.debug('in on_lang_page ' + url)
        page_dict = dict(
            kind='lang_page',
            url=url,
            children=[],
        )
        page_dict.update(context)
        context['parent']['children'].append(page_dict)

        try:
            menu_row = page.find('div', {'id': 'menu-row'})
        except Exception as e:
            LOGGER.error('ERROR on_lang_page: %s : %s' % (e, url))
            return
        for topic in menu_row.find_all('a'):
            try:
                if topic['href'] == '#':
                    print('skipping', topic)
                    continue
                
                topic_url = urljoin(url, topic['href'].strip())
                print(topic_url)
                if self.should_ignore_url(topic_url):
                    print('ignoring topic', topic_url)
                    continue

                # metadata
                title = get_text(topic)
                source_id = get_source_id(topic['href'].strip())
                subject_en = source_id    # short string to match on top-level categories
                context = dict(
                    parent=page_dict,
                    title=title,
                    source_id=source_id,
                    subject_en=subject_en,
                )
                # print('in on_lang_page topic.title=', title, 'topic_subject_id=', source_id, 'subject_en=', subject_en)

                # what type of tab is it?
                if 'Fun' in topic['href']:
                    LOGGER.info('found fun page: %s: %s' % (source_id, title))
                    context['kind'] = 'fun_page'
                elif 'Story' in topic['href']:
                    LOGGER.info('found story page: %s: %s' % (source_id, title))
                    context['kind'] = 'story_page'
                elif any([cid in topic['href'] for cid in SPECIAL_SUBTOPIC_COURSE_IDS]):
                    LOGGER.info('FOUND three-tab special_subtopic_page page: %s: %s' % (source_id, title))
                    context['kind'] = 'special_subtopic_page'
                elif 'gamelist/CRS' in topic['href']:
                    LOGGER.info('found top-level CRS page: %s: %s' % (source_id, title))
                    context['kind'] = 'fun_page'
                else:
                    LOGGER.info('found topic: %s: %s' % (source_id, title))
                    context['kind'] = 'topic_page'
                self.enqueue_url_and_context(topic_url, context)
                # if DEBUG_MODE:
                #     return

            except Exception as e:
                LOGGER.error('on_lang_page: %s : %s' % (e, topic))
Example 8
def process_folder(channel, rel_path, filenames, metadata_provider):
    """
    Create `ContentNode`s from each file in this folder and attach them to
    `channel` under the path `rel_path`.
    """
    LOGGER.debug('IN process_folder ' + str(rel_path) + '     ' +
                 str(filenames))
    if not keep_folder(rel_path):
        return

    chan_path = chan_path_from_rel_path(rel_path, metadata_provider.channeldir)
    chan_path_tuple = path_to_tuple(chan_path)
    chan_path_list = list(chan_path_tuple)
    LOGGER.debug('chan_path_list=' + str(chan_path_list))

    # FIND THE CONTAINING NODE (channel or topic)
    if len(chan_path_list) == 1:
        # CASE CHANNEL ROOT: `rel_path` points to `channeldir`
        # No need to create a topic node here since channel already exists
        containing_node = channel  # attach content nodes in filenames directly to channel

    else:
        # CASE TOPIC FOLDER: `rel_path` points to a channelroot subfolder (a.k.a TopicNode)
        dirname = chan_path_list.pop()  # name of the folder (used as ID for internal lookup)
        topic_parent_node = get_topic_for_path(channel, chan_path_list)

        # read topic metadata to get title and description for the TopicNode
        topic_metadata = metadata_provider.get(chan_path_tuple)
        thumbnail_chan_path = topic_metadata.get('thumbnail_chan_path', None)
        if thumbnail_chan_path:
            thumbnail_rel_path = rel_path_from_chan_path(
                thumbnail_chan_path, metadata_provider.channeldir)
        else:
            thumbnail_rel_path = None
        # create TopicNode for this folder
        topic = dict(
            kind=TOPIC_NODE,
            dirname=dirname,
            source_id='sourceid:' + rel_path,
            title=topic_metadata.get('title', dirname),
            description=topic_metadata.get('description', None),
            author=topic_metadata.get('author', None),
            language=topic_metadata.get('language', None),
            license=topic_metadata.get('license', None),
            thumbnail=thumbnail_rel_path,
            children=[],
        )
        topic_parent_node['children'].append(topic)
        containing_node = topic  # attach content nodes in filenames to the newly created topic

    # filter filenames
    filenames_cleaned = filter_filenames(filenames)
    filenames_cleaned2 = filter_thumbnail_files(chan_path, filenames_cleaned,
                                                metadata_provider)

    # PROCESS FILES
    for filename in filenames_cleaned2:
        chan_filepath = os.path.join(chan_path, filename)
        chan_filepath_tuple = path_to_tuple(chan_filepath)
        metadata = metadata_provider.get(chan_filepath_tuple)
        node = make_content_node(metadata_provider.channeldir, rel_path,
                                 filename, metadata)
        containing_node['children'].append(
            node)  # attach content node to containing_node
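# keep_folder is called above but not shown; a plausible sketch, assuming it
# skips hidden folders and a few known-ignorable names (this ignore list is
# invented for illustration, not the original logic):
import os

IGNORABLE_FOLDER_NAMES = {'.git', '__MACOSX'}

def keep_folder(rel_path):
    return not any(part.startswith('.') or part in IGNORABLE_FOLDER_NAMES
                   for part in rel_path.split(os.path.sep))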
Example 9
def transform_html_vertical(vertical, parent_title=None):
    """
    Parses the `html` children of the vertical: linked PDFs become document
    nodes, other downloadable resources are collected for later packaging, and
    any remaining html content is packaged as a standalone html5 app node.
    Returns: (nodes, downloadable_resources)
    """
    if 'children' not in vertical:
        LOGGER.warning('found empty vertical ' + str(vertical))
        return [], []

    assert all(ch['kind'] == 'html'
               for ch in vertical['children']), 'non htmls found'

    nodes = []
    downloadable_resources = []
    htmls = [ch for ch in vertical['children'] if ch['kind'] == 'html']

    for html in htmls:
        if 'downloadable_resources' in html and html['downloadable_resources']:
            LOGGER.debug('    found downloadable_resources')
            resources = html['downloadable_resources']
            for resource in resources:
                ext = resource['ext']
                if ext == 'pdf':
                    pdf_node = dict(
                        kind=content_kinds.DOCUMENT,
                        title=resource['title'],
                        description=resource.get('description', ''),
                        source_id=resource['relhref'],
                        license=EDRAAK_LICENSE,
                        language=getlang('ar').code,
                        files=[],
                    )
                    file_dict = dict(
                        file_type=file_types.DOCUMENT,
                        path=resource['relhref'],
                        language=getlang('ar').code,
                    )
                    pdf_node['files'].append(file_dict)
                    nodes.append(pdf_node)
                else:
                    downloadable_resources.append(resource)

        else:
            LOGGER.debug('    packaging html content')
            html5app_dict = dict(
                kind=content_kinds.HTML5,
                title=vertical['display_name'],
                # title=EDRAAK_STRINGS['downloadable_resources'],
                description=html.get('description', ''),
                source_id=html['url_name'],
                license=EDRAAK_LICENSE,
                language=getlang('ar').code,
                files=[],
            )
            zip_path = package_html_content_as_html5_zip_file(html)
            zip_file = dict(
                file_type=file_types.HTML5,
                path=zip_path,
                language=getlang('ar').code,
            )
            html5app_dict['files'].append(zip_file)
            nodes.append(html5app_dict)
    # return after the loop, so every html child is processed
    return nodes, downloadable_resources
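# A minimal vertical dict in the shape transform_html_vertical expects, handy
# for exercising the PDF branch in isolation. All field values are invented.
sample_vertical = {
    'display_name': 'Reading materials',
    'children': [{
        'kind': 'html',
        'url_name': 'html_block_001',
        'downloadable_resources': [{
            'title': 'Course handout',
            'ext': 'pdf',
            'relhref': 'chefdata/downloads/handout.pdf',  # hypothetical path
        }],
    }],
}
nodes, leftovers = transform_html_vertical(sample_vertical)
assert nodes[0]['kind'] == content_kinds.DOCUMENT and leftovers == []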
Example 10
def transform_tree(clean_tree, coursedir):
    course_id = clean_tree['course']
    course_title = clean_tree['display_name']
    course_thumbnail = os.path.join(coursedir, 'static',
                                    clean_tree['course_image'])
    if not os.path.exists(course_thumbnail):
        course_image_with_spaces = clean_tree['course_image'].replace('_', ' ')
        course_thumbnail = os.path.join(coursedir, 'static',
                                        course_image_with_spaces)

    course_dict = dict(
        kind=content_kinds.TOPIC,
        title=course_title,
        thumbnail=course_thumbnail,
        source_id=course_id,
        description='',
        language=getlang('ar').code,
        license=EDRAAK_LICENSE,
        children=[],
    )

    for chapter in clean_tree['children']:
        chapter_dict = dict(
            kind=content_kinds.TOPIC,
            title=chapter['display_name'],
            source_id=chapter['url_name'],
            description='',
            language=getlang('ar').code,
            license=EDRAAK_LICENSE,
            children=[],
        )
        course_dict['children'].append(chapter_dict)
        chapter_downloadable_resources = []

        for sequential in chapter['children']:

            # SPECIAL CASE: skip empty parent nodes of discussions
            if len(sequential['children']) == 0:
                LOGGER.debug('Skipping empty sequential ' + str(sequential))
                continue

            # DEFAULT CASE: process as regular topic node
            sequential_dict = dict(
                kind=content_kinds.TOPIC,
                title=sequential['display_name'],
                source_id=sequential['url_name'],
                description=sequential.get('description', ''),
                language=getlang('ar').code,
                license=EDRAAK_LICENSE,
                children=[],
            )
            chapter_dict['children'].append(sequential_dict)

            for vertical in sequential['children']:
                vertical_type = guess_vertical_type(vertical)

                if vertical_type in [
                        'knowledge_check_vertical', 'test_vertical'
                ]:
                    exercise_dict = transform_vertical_to_exercise(vertical)
                    if exercise_dict:
                        sequential_dict['children'].append(exercise_dict)
                elif vertical_type == 'video_vertical':
                    video_dict, downloadable_resources = transform_video_vertical(
                        vertical)
                    if video_dict:
                        sequential_dict['children'].append(video_dict)
                    chapter_downloadable_resources.extend(
                        downloadable_resources)
                elif vertical_type == 'html_vertical':
                    nodes, downloadable_resources = transform_html_vertical(
                        vertical)
                    if nodes:
                        sequential_dict['children'].extend(nodes)
                    chapter_downloadable_resources.extend(
                        downloadable_resources)
                else:
                    LOGGER.debug('skipping ' + vertical_type + ' url_name=' +
                                 vertical['url_name'])

        #
        if chapter_downloadable_resources:
            LOGGER.debug('  Packaging chapter_downloadable_resources')
            source_id = chapter['url_name'] + '-downloadable-resources'
            html5app_dict = dict(
                kind=content_kinds.HTML5,
                title=EDRAAK_STRINGS['downloadable_resources'],
                description=EDRAAK_STRINGS[
                    'downloadable_resources_description'],
                source_id=source_id,
                license=EDRAAK_LICENSE,
                language=getlang('ar').code,
                files=[],
            )
            zip_path = make_html5zip_from_resources(
                chapter_downloadable_resources, basefilename=source_id + '2')
            zip_file = dict(
                file_type=file_types.HTML5,
                path=zip_path,
                language=getlang('ar').code,
            )
            html5app_dict['files'].append(zip_file)
            chapter_dict['children'].append(html5app_dict)

    flattened_course_dict = flatten_transformed_tree(course_dict)
    return flattened_course_dict
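# guess_vertical_type is called above but not shown; a plausible sketch based
# on the branch names transform_tree expects it to return (the classification
# rules here are assumptions, not the original logic; distinguishing
# 'test_vertical' from a knowledge check would need more context):
def guess_vertical_type(vertical):
    kinds = set(ch.get('kind') for ch in vertical.get('children', []))
    if 'problem' in kinds:
        return 'knowledge_check_vertical'
    if 'video' in kinds:
        return 'video_vertical'
    if kinds and kinds <= {'html'}:
        return 'html_vertical'
    return 'unknown_vertical'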
    def on_lesson_page(self, url, page, context):
        LOGGER.debug('      in on_lesson_page ' + url)
        page_dict = dict(
            kind='lessons_page',
            url=url,
            children=[],
        )
        page_dict.update(context)
        context['parent']['children'].append(page_dict)

        try:
            menu_row = page.find('div', {'id': 'row-exu'})
        except Exception as e:
            LOGGER.error('on_lesson_page: %s : %s' % (e, page))
            return

        contents = menu_row.find_all('div', {'class': 'col-md-3'})
        for content in contents:
            try:
                title = get_text(content.find('div', {'class': 'txtline'}))
                # TODO: description
                thumbnail = content.find('a').find('img')['src']
                thumbnail = get_absolute_path(thumbnail)
                main_file, master_file, source_id = get_content_link(content)
                LOGGER.debug('         content: %s: %s' % (source_id, title))

                if self.should_ignore_url(main_file):
                    print('ignoring content', title, main_file)
                    continue
                if len(main_file) < 10:
                    print('something strange --- short main_file url:', title, '-', main_file, '-', master_file)

                if main_file.endswith('mp4') or main_file.endswith('MP4') or main_file.endswith('m4v'):
                    video = dict(
                        url=main_file,
                        kind='PrathamVideoResource',
                        description='source_url=' + main_file if DEBUG_MODE else '',
                        title=title,
                        source_id=source_id,
                        thumbnail_url=thumbnail,
                        children=[],
                    )
                    video.update(self.get_video_metadata(main_file))
                    page_dict['children'].append(video)

                elif main_file.endswith('pdf'):
                    pdf = dict(
                        url=main_file,
                        kind='PrathamPdfResource',
                        title=title,
                        description='source_url=' + main_file if DEBUG_MODE else '',
                        source_id=source_id,
                        thumbnail_url=thumbnail,
                        children=[],
                    )
                    page_dict['children'].append(pdf)

                elif main_file.endswith('html') and master_file.endswith('zip'):
                    if '.~' in master_file:
                        # Fix broken links of the form https://www.prathamopenschool.org/Gj/gamelist/CRS174/.~/CourseContent/Games/NumberKas_GJ.zip
                        pathels = master_file.split('/')
                        master_file = '/'.join(pathels[0:3] + pathels[7:])
                    zipfile = dict(
                        url=master_file,
                        kind='PrathamZipResource',
                        title=title,
                        description='source_url=' + master_file if DEBUG_MODE else '',
                        source_id=source_id,
                        thumbnail_url=thumbnail,
                        main_file=main_file,     # needed to rename to index.html if different
                        children=[],
                    )
                    page_dict['children'].append(zipfile)

                else:
                    LOGGER.error('ZZZZ>>> Content not supported: onpage=%s main_file=%s master_file=%s' % (url, main_file, master_file))
                    unsupported_rsrc = dict(
                        url=main_file,
                        referring_url=url,
                        kind='UnsupportedPrathamWebResource',
                        title=title,
                        source_id=source_id,
                        thumbnail_url=thumbnail,
                        children=[],
                    )
                    page_dict['children'].append(unsupported_rsrc)
                    
            except Exception as e:
                LOGGER.error('on_lesson_page: %s : %s' % (e, content))
Example 12
def archive_page(url, download_root):
    """
    Download fully rendered page and all related assets into ricecooker's site archive format.

    :param url: URL to download
    :param download_root: Site archive root directory
    :return: A dict containing info about the page archive operation
    """

    os.makedirs(download_root, exist_ok=True)
    content, props = asyncio.get_event_loop().run_until_complete(load_page(url))

    parsed_url = urlparse(url)
    page_domain = parsed_url.netloc.replace(':', '_')

    # get related assets
    base_url = url[:url.rfind('/')]
    urls_to_replace = {}

    if content:
        def html5_derive_filename(url):
            return get_archive_filename(url, page_domain, download_root, urls_to_replace)
        download_static_assets(content, download_root, base_url, derive_filename=html5_derive_filename)

        for key in urls_to_replace:
            url_parts = urlparse(key)
            # When we get an absolute URL, it may appear in one of three different ways in the page:
            key_variants = [
                # 1. /path/to/file.html
                key.replace(url_parts.scheme + '://' + url_parts.netloc, ''),
                # 2. https://www.domain.com/path/to/file.html
                key,
                # 3. //www.domain.com/path/to/file.html
                key.replace(url_parts.scheme + ':', ''),
            ]

            orig_content = content
            for variant in key_variants:
                # searching within quotes ensures we only replace the exact URL we are
                # trying to replace
                # we avoid using BeautifulSoup because Python HTML parsers can be destructive and
                # do things like strip out the doctype.
                content = content.replace('="{}"'.format(variant), '="{}"'.format(urls_to_replace[key]))
                content = content.replace('url({})'.format(variant), 'url({})'.format(urls_to_replace[key]))

            if content == orig_content:
                LOGGER.debug("link not replaced: {}".format(key))
                LOGGER.debug("key_variants = {}".format(key_variants))

        download_dir = os.path.join(page_domain, parsed_url.path.split('/')[-1].replace('?', '_'))
        download_path = os.path.join(download_root, download_dir)
        os.makedirs(download_path, exist_ok=True)

        index_path = os.path.join(download_path, 'index.html')
        with open(index_path, 'w', encoding='utf-8') as f:
            f.write(content)

        page_info = {
            'url': url,
            'cookies': props['cookies'],
            'index_path': index_path,
            'resources': list(urls_to_replace.values())
        }

        return page_info

    return None
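# Illustrative invocation (placeholder URL): archive one page into ./archive
# and inspect what was saved and which assets were rewritten.
info = archive_page('https://example.org/lessons/intro.html', 'archive')
if info:
    print('saved to', info['index_path'])
    print(len(info['resources']), 'assets downloaded')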
def _build_json_tree(parent_node, sourcetree, lang=None):
    # type: (dict, List[dict], str) -> None
    """
    Parse the web resource nodes given in `sourcetree` and add as children of `parent_node`.
    """
    # EXPECTED_NODE_TYPES = ['TessaLangWebRessourceTree', 'TessaCategory', 'TessaSubpage',
    #                        'TessaModule']
    for source_node in sourcetree:
        if 'kind' not in source_node:
            print('kind-less source_node', source_node)
            continue
        kind = source_node['kind']
        # if kind not in EXPECTED_NODE_TYPES:
        #     raise NotImplementedError('Unexpected web resource node type encountered.')

        if kind == 'TessaLangWebRessourceTree':
            # this is the root of the tree, no special attributes, just process children
            source_tree_children = source_node.get("children", [])
            _build_json_tree(parent_node, source_tree_children, lang=lang)

        elif kind in ('TessaSubpage',
                      'TessaAudioResourcesSubpage',
                      'TessaAudioResourceTopicSubpage',
                      'TessaAudioResourceSection'):
            # all four subpage kinds map to plain topic nodes, so handle them together
            child_node = dict(
                kind=content_kinds.TOPIC,
                source_id=source_node['source_id'],
                title=source_node['title'],
                author='TESSA',
                description='',  # TODO: description of source_node['url']
                thumbnail=source_node.get("thumbnail"),
                children=[],
            )
            parent_node['children'].append(child_node)
            LOGGER.debug('Created new TopicNode for %s titled %s' %
                         (kind, child_node['title']))
            source_tree_children = source_node.get("children", [])
            _build_json_tree(child_node, source_tree_children, lang=lang)

        elif kind == 'TessaModule':
            child_node = dict(
                kind=content_kinds.HTML5,
                source_id=source_node['source_id'],
                language=source_node['lang'],
                title=source_node['title'],
                description='',  # TODO: use source_node['description']
                license=TESSA_LICENSE,
                files=[],
            )
            zip_path = download_module(source_node['url'],
                                       lang=source_node['lang'])
            module_html_file = dict(
                file_type=file_types.HTML5,
                path=zip_path,
                language=source_node['lang'],
            )
            child_node['files'] = [module_html_file]
            parent_node['children'].append(child_node)
            LOGGER.debug('Created HTML5AppNode for TessaModule titled ' +
                         child_node['title'])

        elif kind == 'TessaContentPage':
            page_info = scrape_content_page(source_node['url'], lang)
            child_node = dict(
                kind=content_kinds.HTML5,
                source_id=source_node['source_id'],
                language=source_node['lang'],
                title=source_node['title'],
                description=source_node.get('description', ''),
                license=TESSA_LICENSE,
                files=[],
            )
            module_html_file = dict(
                file_type=file_types.HTML5,
                path=page_info['zip_path'],
                language=source_node['lang'],
            )
            child_node['files'] = [module_html_file]
            parent_node['children'].append(child_node)
            LOGGER.debug('Created HTML5AppNode for TessaContentPage titled ' +
                         child_node['title'])

        elif kind == 'TessaAudioResouce':  # note: 'Resouce' (sic) matches the kind string set by the crawler
            child_node = dict(
                kind=content_kinds.AUDIO,
                source_id=source_node['source_id'],
                language=source_node['lang'],
                title=source_node.get('title', 'NOTITLE'),
                description='',  # TODO: use source_node['description']
                license=TESSA_LICENSE,
                files=[],
            )
            mp3_file = dict(
                file_type=file_types.AUDIO,
                path=source_node['url'],
                language=source_node['lang'],
            )
            child_node['files'] = [mp3_file]
            parent_node['children'].append(child_node)
            LOGGER.debug('Created AudioNode from file url ' +
                         source_node['url'])

        elif kind == 'TessaPDFDocument':
            child_node = dict(
                kind=content_kinds.DOCUMENT,
                source_id=source_node['source_id'],
                language=source_node['lang'],
                title=source_node.get('title', 'NOTITLE'),
                description='',  # TODO: use source_node['description']
                license=TESSA_LICENSE,
                files=[],
            )
            pdf_file = dict(
                file_type=file_types.DOCUMENT,
                path=source_node['url'],
                language=source_node['lang'],
            )
            child_node['files'] = [pdf_file]
            parent_node['children'].append(child_node)
            LOGGER.debug('Created PDF Document Node from url ' +
                         source_node['url'])

        else:
            # LOGGER.critical("Encountered an unknown content node format.")
            print('***** Skipping content kind', source_node['kind'], 'titled',
                  source_node.get('title', 'NOTITLE'))
            continue

    return parent_node
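# Typical entry point (the file path and tree shape are assumptions for
# illustration): seed a bare channel-level dict and let _build_json_tree
# attach children in place.
import json

ricecooker_json_tree = dict(title='TESSA (en)', children=[])
with open('chefdata/trees/web_resource_tree_en.json') as jsonf:  # hypothetical path
    web_resource_tree = json.load(jsonf)
_build_json_tree(ricecooker_json_tree, [web_resource_tree], lang='en')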
def download_module_no_toc(module_url, lang=None):
    """
    Extracting the module table of contents from the sidebar nav doesn't work
    for certain modules in FR, e.g.
    http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=105334&section=1.1

    When no TOC is available, we crawl the pages one by one and build up the
    TOC ourselves in `module_contents_dict`.
    """
    LOGGER.debug('Scraping module @ url = ' + str(module_url))
    doc = get_parsed_html_from_url(module_url)
    destination = tempfile.mkdtemp()
    print('destination=', destination)

    # copy css/js/images from skel
    shutil.copytree('chefdata/templates/module_skel/styles',
                    os.path.join(destination, 'styles'))

    source_id = parse_qs(urlparse(module_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    module_title = raw_title.replace('OLCreate:', '')\
            .replace('TESSA_ARABIC', '')\
            .replace('TESSA_Eng', '')\
            .replace('TESSA_Fr', '')\
            .strip()

    module_contents_dict = dict(
        kind='TessaModuleContentsDict',
        source_id=source_id,
        title=module_title,
        lang=lang,
        children=[],
    )
    # print(module_contents_dict)

    # iteratively download all sections by following the "Next" links
    current_url = module_url
    current_section = None
    is_first_section = True
    while True:
        LOGGER.debug('processing current_url' + str(current_url))
        current_doc = get_parsed_html_from_url(current_url)

        # special handling for module-level page (no section in url but is really Section 1)
        if is_first_section:
            section_filename = 'section-1.html'
            is_first_section = False
        else:
            section_filename = get_section_filename(current_url)

        # Do the actual download
        download_section(current_url, destination, section_filename, lang)

        # Store section/subsection info so we can build the TOC later
        doc = get_parsed_html_from_url(current_url)
        raw_title = doc.select_one("head title").text
        the_title = raw_title.replace('OLCreate:', '')\
                .replace('TESSA_ARABIC', '')\
                .replace('TESSA_Eng', '')\
                .replace('TESSA_Fr', '')\
                .strip()

        # sections e.g. section-3.html
        if '_' not in section_filename:
            section_dict = dict(kind='TessaModuleContentsSection',
                                title=the_title,
                                href=current_url,
                                filename=section_filename,
                                children=[])
            module_contents_dict['children'].append(section_dict)
            print('  - section:', the_title[0:80])
            current_section = section_dict

        # subsections e.g. section-3_2.html
        else:
            subsection_title = the_title.replace(module_title, '')
            # str.replace returns a new string, so the result must be reassigned
            subsection_title = subsection_title.replace(current_section['title'], '')
            subsection_title = subsection_title.lstrip()
            if subsection_title.startswith(': '):
                subsection_title = subsection_title.replace(': ', '', 1)
            subsection_dict = dict(
                kind='TessaModuleContentsSubsection',
                title=subsection_title,
                href=current_url,
                filename=section_filename,
            )
            print('     - subsection:', subsection_title[0:80])
            current_section['children'].append(subsection_dict)

        # Recurse if next
        next_url = _get_next_section_url(current_doc)
        if next_url:
            current_url = next_url
        else:
            break

    # for debugging...
    # pp.pprint(module_contents_dict)

    with open('chefdata/templates/module_index.html') as tmplf:
        module_index_tmpl = jinja2.Template(tmplf.read())
    index_contents = module_index_tmpl.render(module=module_contents_dict)
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(index_contents)

    # return module_contents_dict
    zip_path = create_predictable_zip(destination)
    return zip_path
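# get_section_filename is used by both module downloaders but not shown here;
# a plausible sketch, assuming it derives 'section-3_2.html' from the
# '&section=3.2' query parameter (an inference from the filenames the code
# expects, not the original implementation):
from urllib.parse import parse_qs, urlparse

def get_section_filename(section_href):
    section = parse_qs(urlparse(section_href).query).get('section', ['1'])[0]
    return 'section-' + section.replace('.', '_') + '.html'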
def download_module(module_url, lang=None):
    LOGGER.debug('Scraping module @ url = ' + module_url)
    doc = get_parsed_html_from_url(module_url)
    source_id = parse_qs(urlparse(module_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    module_title = raw_title.replace('OLCreate:', '')\
            .replace('TESSA_ARABIC', '')\
            .replace('TESSA_Eng', '')\
            .replace('TESSA_Fr', '')\
            .strip()
    module_contents_dict = dict(
        kind='TessaModuleContentsDict',
        lang=lang,
        source_id=source_id,
        title=module_title,
        children=[],
    )

    # TRY TO CREATE MODULE TOC SIDEBAR MENU
    ############################################################################
    current_li_deep = doc.find('li', class_='oucontent-tree-current')

    # Sept 5th: special treatment for modules with no TOC in sidebar
    if current_li_deep is None:
        return download_module_no_toc(module_url, lang=lang)

    # CREATE MODULE TOC SIDEBAR MENU
    # July 28 HACK : infer module_toc_li  using marker on sublist-li
    ############################################################################
    destination = tempfile.mkdtemp()
    print('destination=', destination)
    # copy css/js/images from skel
    shutil.copytree('chefdata/templates/module_skel/styles',
                    os.path.join(destination, 'styles'))

    is_first_section = True
    module_toc_li = current_li_deep.find_parent('li', class_='item-section')
    # print(module_toc_li.prettify())
    # module_contents_div = module_toc_li.find('div', class_='oucontent-contents')
    outer_module_ul = module_toc_li.find('ul',
                                         class_='child-item-list',
                                         recursive=False)
    inner_module_ul = outer_module_ul.find(
        'div', class_='oucontent-contents').find('ul', recursive=False)
    section_lis = inner_module_ul.find_all('li', recursive=False)
    print(len(section_lis))

    # DETECT IF SIMPLE MODULE (single page, no sections) OR COMPLEX MODULE (with sections)
    if len(section_lis) == 0:
        print('UNEXPECTED --------  len(section_lis) == 0')
        print(module_url, '<<< <<< ' * 6)
    is_simple_module = len(section_lis) == 1

    # SIMPLE MODULES THAT CONSIST OF A SINGLE PAGE -- becomes index.html
    if is_simple_module:
        section_li = section_lis[0]
        # print('*'*120)
        # print(section_li.prettify())
        section_title_span = section_li.find('span',
                                             class_='oucontent-tree-item')
        section_title = get_text(section_title_span)
        print('Processing simple module:', section_title)
        section_dict = dict(
            kind='TessaModuleContentsSection',
            title=section_title,
            href=module_url,
            filename='index.html',  # TODO: figure out if this is necessary
            children=[],
        )
        # print('  section:', section_title)
        module_contents_dict['children'].append(section_dict)

        subsections_ul = section_li.find('ul', recursive=False)
        if subsections_ul:
            pass
            #print('found some subsections...')
        else:
            pass
            #print('no subsections <ul> found in this section')

        download_page(module_url, destination, 'index.html', lang)
    # /SIMPLE MODULE

    # COMPLEX MODULES WITH SECTIONS AND custom-made TOC in index.html
    else:
        for section_li in section_lis:

            if 'download individual sections' in get_text(section_li):  # TODO: AR, SW, FR
                print('skipping section "Read or download individual sections..."')
                continue

            # print(section_li.prettify())
            # print('>'*80)
            section_title_span = section_li.find('span',
                                                 class_='oucontent-tree-item')
            if section_title_span:
                if section_title_span.find('span', class_='current-title'):
                    section_href = module_url
                else:
                    section_a = section_title_span.find('a')
                    if section_a:
                        section_href = section_a['href']
                    else:
                        section_href = '#NOLINK'  # for sections like "Top 20 ideas for teaching large classes"
            else:
                section_href = '#NOLINK'  # for sections like "Read or download individual sections of the m..."

            # special case for first section --- since it doesn't save section in filename
            # manually call download_page with filename section_1.html with contents of current page
            if is_first_section:
                section_filename = 'section-1.html'
                is_first_section = False
            else:
                if '#NOLINK' not in section_href:
                    section_filename = get_section_filename(section_href)
                # NOTE: for '#NOLINK' sections the previous section_filename is
                # carried over; harmless, since the download loop below skips
                # '#NOLINK' entries

            # accesshide_span = section_title_span.find('span', class_='accesshide')
            # if accesshide_span:
            #     accesshide_span.extract()
            # subsections_ul.extract()
            section_title = get_text(section_title_span)

            section_dict = dict(
                kind='TessaModuleContentsSection',
                title=section_title,
                href=section_href,
                filename=section_filename,
                children=[],
            )
            # print('  section:', section_title)
            module_contents_dict['children'].append(section_dict)

            subsections_ul = section_li.find('ul', recursive=False)
            if subsections_ul:
                subsection_lis = subsections_ul.find_all('li')
                for subsection_li in subsection_lis:
                    # print('<'*100)
                    # print(subsection_li)
                    #print('>>>>>')
                    #print(subsection_li.prettify())
                    subsection_link = subsection_li.find('a')
                    if not subsection_link:  # handle weird <li>s that contain no link
                        LOGGER.warning('Skipping subsection ' +
                                       subsection_li.get_text() +
                                       ' because it has no subsection_link')
                        continue
                    subsection_href = subsection_link['href']
                    subsection_filename = get_section_filename(subsection_href)
                    # subaccesshide_span = subsection_li.find('span', class_='accesshide')
                    # if subaccesshide_span:
                    #     subaccesshide_span.extract()
                    subsection_title = get_text(subsection_li)
                    subsection_dict = dict(
                        kind='TessaModuleContentsSubsection',
                        title=subsection_title,
                        href=subsection_href,
                        filename=subsection_filename,
                    )
                    # print('    subsection:', subsection_title)
                    section_dict['children'].append(subsection_dict)
            else:
                print('no subsections <ul> found in this section')

        with open('chefdata/templates/module_index.html') as tmplf:
            module_index_tmpl = jinja2.Template(tmplf.read())
        index_contents = module_index_tmpl.render(module=module_contents_dict)
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write(index_contents)

        # download the html content from each section/subsection
        for section in module_contents_dict['children']:
            if '#NOLINK' in section['href']:
                print('nothing to download for #NOLINK section')
                continue
            download_section(section['href'], destination, section['filename'],
                             lang)
            for subsection in section['children']:
                if '#NOLINK' in subsection['href']:
                    print('nothing to download for #NOLINK subsection')
                    continue
                download_section(subsection['href'], destination,
                                 subsection['filename'], lang)
        # /COMPLEX MODULE

    zip_path = create_predictable_zip(destination)
    return zip_path
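# Illustrative end-to-end call for a single module (the URL is the FR example
# from the docstring above, minus the section parameter): the returned zip is
# ready to attach to an HTML5AppNode file.
zip_path = download_module(
    'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=105334',
    lang='fr',
)
print('module packaged at', zip_path)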
Example 16
    def modify_zip(self, scorm_zip):
        """
        The SCORM modules we receive in some cases have graphics that reference UI elements that don't exist in
        Kolibri. This function modifies the zip to remove them and returns the modified zip.
        :param scorm_zip: The path to the original zip file.
        :return: Path to the modified zip file (or the original zip if no changes were made).
        """
        zip_dir_name = os.path.splitext(os.path.basename(scorm_zip))[0]
        zip_root = os.path.join(self.temp_dir, zip_dir_name)
        output_zip = os.path.join(self.temp_dir, 'out_zips', zip_dir_name)

        os.makedirs(zip_root, exist_ok=True)
        os.makedirs(os.path.dirname(output_zip), exist_ok=True)

        zip_archive = zipfile.ZipFile(scorm_zip)  # renamed to avoid shadowing the builtin zip()
        zip_archive.extractall(zip_root)

        zip_changed = False
        telas_end_sprites = os.path.join(zip_root, 'curso', 'telas', 'end', 'sprites.png')
        if os.path.exists(telas_end_sprites):
            LOGGER.debug("Deleting sprites at {}".format(telas_end_sprites))
            os.remove(telas_end_sprites)
            zip_changed = True
        else:
            assert "n1_ted_len_en_u01_v02" not in scorm_zip, os.listdir(zip_root)

        for replace_img in self.replace_images:
            img_glob = glob.glob(os.path.join(zip_root, '**', replace_img), recursive=True)
            for img in img_glob:
                os.remove(img)
                shutil.copy(os.path.join(ROOT_DIR, 'assets', replace_img), img)
                if replace_img not in self.replaced_images:
                    self.replaced_images.append(replace_img)

                zip_changed = True

        # make any HTML replacements
        replaced_imgs = []
        for html_file in glob.glob(os.path.join(zip_root, '**', '*.html'), recursive=True):
            with open(html_file, 'rb') as htmlf:
                soup = BeautifulSoup(htmlf.read(), 'html.parser')

            for img in self.remove_imgs:
                img_tag = soup.find('img', src = re.compile('{}$'.format(img)))
                if img_tag:
                    if not img in self.removed_imgs:
                        self.removed_imgs.append(img)
                    replaced_imgs.append(img)
                    img_tag.extract()
                    with open(html_file, 'wb') as outf:
                        outf.write(soup.prettify('utf-8'))
                    zip_changed = True
                    break
                else:
                    assert img not in soup.prettify(), "Problem replacing image {} in {}".format(img, scorm_zip)

        if 'n2_tek_en_lan_u09' in scorm_zip:
            assert zip_changed, "Narrative SCORM module had no changes."
            assert 'kap_cerrar.png' in replaced_imgs, "Replaced images = {}".format(replaced_imgs)
            assert 'kap_cerrar.png' in self.removed_imgs, "Removed images = {}".format(self.removed_imgs)

        if zip_changed:
            temp_zip = create_predictable_zip(zip_root)
            scorm_zip = output_zip + '.zip'
            os.rename(temp_zip, scorm_zip)

        return scorm_zip
    def convert_ka_node_to_ricecooker_node(self, ka_node, target_lang=None):
        """
        Convert a KA node (a subclass of `KhanNode`) to a ricecooker node (dict).
        Returns None if the node slug is blacklisted or inadmissible for
        inclusion for another reason (e.g. an untranslated video with no
        subtitles available).
        """

        if ka_node.slug in self.slug_blacklist:
            return None

        if isinstance(ka_node, KhanTopic):
            LOGGER.debug('Converting ka_node ' + ka_node.slug +
                         ' to ricecooker json')
            topic = dict(
                kind=content_kinds.TOPIC,
                source_id=ka_node.id,
                title=ka_node.title,
                description=ka_node.description[:400]
                if ka_node.description else '',
                slug=ka_node.slug,
                children=[],
            )
            for ka_node_child in ka_node.children:
                if isinstance(
                        ka_node_child, KhanTopic
                ) and ka_node_child.slug in self.topic_replacements:
                    # This topic must be replaced by a list of other topic nodes
                    replacements = self.topic_replacements[ka_node_child.slug]
                    LOGGER.debug('Replacing ka_node ' + ka_node.slug +
                                 ' with replacements=' + str(replacements))
                    for r in replacements:
                        rtopic = dict(
                            kind=content_kinds.TOPIC,
                            source_id=r['slug'],
                            title=r['translatedTitle'],  # guaranteed to exist
                            description=r.get('description'),  # (optional)
                            slug=r['slug'],
                            children=[],
                        )
                        topic["children"].append(rtopic)
                        LOGGER.debug('  >>> rtopic = ' + rtopic["slug"])
                        for rchild in r['children']:  # guaranteed to exist
                            LOGGER.debug('      >>>> rchild["slug"] = ' +
                                         rchild["slug"])
                            if 'children' not in rchild:
                                # CASE A: two-level replacement hierarchy
                                rchild_ka_node = self.topics_by_slug.get(
                                    rchild['slug'])
                                if rchild_ka_node:
                                    if 'translatedTitle' in rchild:
                                        rchild_ka_node.title = rchild[
                                            'translatedTitle']
                                    rchildtopic = self.convert_ka_node_to_ricecooker_node(
                                        rchild_ka_node,
                                        target_lang=target_lang)
                                    if rchildtopic:
                                        rtopic["children"].append(rchildtopic)
                                else:
                                    LOGGER.warning(
                                        'Failed to find rchild slug=' +
                                        rchild['slug'])
                            else:
                                # CASE B: three-level replacement hierarchy
                                rchildtopic = dict(
                                    kind=content_kinds.TOPIC,
                                    source_id=rchild['slug'],
                                    title=rchild[
                                        'translatedTitle'],  # guaranteed to exist
                                    description=rchild.get(
                                        'description'),  # (optional)
                                    slug=rchild['slug'],
                                    children=[],
                                )
                                rtopic["children"].append(rchildtopic)
                                for rgrandchild in rchild['children']:
                                    rgrandchild_slug = rgrandchild['slug']
                                    LOGGER.debug(
                                        '               >>> rgrandchild_slug = '
                                        + rgrandchild_slug)
                                    rgrandchild_ka_node = self.topics_by_slug.get(
                                        rgrandchild_slug)
                                    if rgrandchild_ka_node:
                                        if 'translatedTitle' in rgrandchild:
                                            # assign to .title -- assigning the bare
                                            # string would clobber the node (cf. CASE A)
                                            rgrandchild_ka_node.title = rgrandchild[
                                                'translatedTitle']
                                        rgrandchildtopic = self.convert_ka_node_to_ricecooker_node(
                                            rgrandchild_ka_node,
                                            target_lang=target_lang)
                                        if rgrandchildtopic:
                                            rchildtopic["children"].append(
                                                rgrandchildtopic)
                                    else:
                                        LOGGER.warning(
                                            'Failed to find rgrandchild slug='
                                            + rgrandchild_slug)
                else:
                    # This is the more common case (no replacement), just add...
                    child = self.convert_ka_node_to_ricecooker_node(
                        ka_node_child,
                        target_lang=target_lang,
                    )
                    if child:
                        topic["children"].append(child)
            # Skip empty topics
            if topic["children"]:
                return topic
            else:
                return None

        elif isinstance(ka_node, KhanExercise):
            if ka_node.mastery_model in EXERCISE_MAPPING:
                mastery_model = EXERCISE_MAPPING[ka_node.mastery_model]
            else:
                LOGGER.warning(
                    "Unknown mastery model ({}) for exercise with id: {}".
                    format(ka_node.mastery_model, ka_node.id))
                mastery_model = exercises.M_OF_N

            # common core tags
            tags = []
            if ka_node.slug in CC_MAPPING:
                tags.append(CC_MAPPING[ka_node.slug])

            exercise = dict(
                kind=content_kinds.EXERCISE,
                source_id=ka_node.id,
                title=ka_node.title,
                description=ka_node.description[:400]
                if ka_node.description else '',
                exercise_data=mastery_model,
                license=dict(
                    license_id=licenses.SPECIAL_PERMISSIONS,
                    copyright_holder="Khan Academy",
                    description=
                    "Permission granted to distribute through Kolibri for non-commercial use",
                ),  # need to formalize with KA
                thumbnail=ka_node.thumbnail,
                slug=ka_node.slug,
                questions=[],
                tags=tags,
            )
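            # Attach Perseus-format assessment items; the KA API marks empty
            # items with the literal string "null", which the check below skips.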
            for ka_assessment_item in ka_node.get_assessment_items():
                if ka_assessment_item.data and ka_assessment_item.data != "null":
                    assessment_item = dict(
                        question_type=exercises.PERSEUS_QUESTION,
                        id=ka_assessment_item.id,
                        item_data=ka_assessment_item.data,
                        source_url=ka_assessment_item.source_url,
                    )
                    exercise["questions"].append(assessment_item)
            # if there are no questions for this exercise, return None
            if not exercise["questions"]:
                return None
            return exercise

        elif isinstance(ka_node, KhanVideo):
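            # Keep the original language code for the subtitle/dubbed-video checks
            # below; VIDEO_LANGUAGE_MAPPING may remap target_lang to a different
            # code for the YouTube lookups.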
            le_target_lang = target_lang
            DUBBED_VIDEOS = DUBBED_VIDEOS_BY_LANG.get(le_target_lang, [])
            target_lang = VIDEO_LANGUAGE_MAPPING.get(target_lang, target_lang)

            if ka_node.youtube_id != ka_node.translated_youtube_id:
                if ka_node.lang != target_lang.lower():
                    LOGGER.info(
                        "Node with youtube id: {} and translated id: {} has wrong language"
                        .format(ka_node.youtube_id,
                                ka_node.translated_youtube_id))
                    return None

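            # The post-processor shrinks file size: ffmpeg re-encodes the audio
            # track to 32 kbps mono, then the temp file replaces the original.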
            files = [
                dict(
                    file_type="video",
                    youtube_id=ka_node.translated_youtube_id,
                    high_resolution=False,
                    download_settings={
                        'postprocessors': [{
                            'key':
                            'ExecAfterDownload',
                            'exec_cmd':
                            'ffmpeg -hide_banner -loglevel panic -i {} -b:a 32k -ac 1 {}_tmp.mp4 && mv {}_tmp.mp4 {}',
                        }]
                    })
            ]

            # Find all subtitles that are available for this video
            subtitle_languages = get_subtitle_languages(
                ka_node.translated_youtube_id)

            # If the video is not dubbed in the target language and no usable
            # subtitles are available either, skip this node.
            if ka_node.lang != target_lang.lower():
                if ka_node.translated_youtube_id in DUBBED_VIDEOS:
                    pass  # video known to be translated, so include it
                elif not any(
                        should_include_subtitle(sub_code, le_target_lang)
                        for sub_code in subtitle_languages):
                    LOGGER.error(
                        "Untranslated video {} and no subs available. Skipping."
                        .format(ka_node.translated_youtube_id))
                    return None

            for lang_code in subtitle_languages:
                if is_youtube_subtitle_file_supported_language(lang_code):
                    # KA English is special: include subs for all available langs
                    if target_lang == "en" or should_include_subtitle(
                            lang_code, le_target_lang):
                        files.append(
                            dict(
                                file_type="subtitles",
                                youtube_id=ka_node.translated_youtube_id,
                                language=lang_code,
                            ))
                    else:
                        LOGGER.debug(
                            'Skipping subs with lang_code {} for video {}'.
                            format(lang_code, ka_node.translated_youtube_id))

            # convert KA's license format into our internal license classes
            if ka_node.license in LICENSE_MAPPING:
                license = LICENSE_MAPPING[ka_node.license]
            else:
                # license = licenses.CC_BY_NC_SA # or?
                LOGGER.error(
                    "Unknown license ({}) on video with youtube id: {}".format(
                        ka_node.license, ka_node.translated_youtube_id))
                return None

            video = dict(
                kind=content_kinds.VIDEO,
                # POLICY: set the `source_id` based on the `youtube_id` of the
                # original English video and not the `translated_youtube_id`:
                source_id=ka_node.youtube_id,
                title=ka_node.title,
                description=ka_node.description[:400]
                if ka_node.description else '',
                license=license,
                thumbnail=ka_node.thumbnail,
                files=files,
            )

            return video

        elif isinstance(ka_node, KhanArticle):
            # TODO
            return None
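The converter above walks a KA node tree and returns plain dicts in ricecooker's JSON schema, returning None for empty topics, question-less exercises, and untranslated videos. A minimal sketch of the calling side, assuming a hypothetical get_khan_topic_tree helper and chef instance (names not from this file):

# Hedged sketch: drive the converter from the root of the KA topic tree.
# `get_khan_topic_tree` and `chef` are assumed names, not from this file.
root_ka_node = get_khan_topic_tree(lang='fr')  # KhanTopic tree (assumption)
channel = dict(kind='Channel', title='Khan Academy (fr)', children=[])
root_topic = chef.convert_ka_node_to_ricecooker_node(root_ka_node, target_lang='fr')
if root_topic:
    # lift the root's children so the channel itself acts as the root topic
    channel['children'].extend(root_topic['children'])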
    def on_fun_page(self, url, page, context):
        """
        This handles pages of the form gamelist/CRS??? and hn/Fun that contain
        direct links to resources without the topic/subtopic hierarchy.
        """
        LOGGER.debug('     in on_fun_page ' + url)
        page_dict = dict(
            kind='fun_page',
            url=url,
            children=[],
        )
        page_dict.update(context)
        context['parent']['children'].append(page_dict)

        try:
            body_row = page.find('div', {'id': 'body-row'})
            contents_row = body_row.find('div', {'class': 'row'})
        except Exception as e:
            LOGGER.error('ERROR on_fun_page: %s : %s' % (e, url))
            return
        contents = contents_row.find_all('div', {'class': 'col-md-3'})

        for content in contents:
            try:
                title = get_text(content.find('div', {'class': 'txtline'}))
                # TODO: description
                thumbnail = content.find('a').find('img')['src']
                thumbnail = get_absolute_path(thumbnail)

                # get_fun_content_link
                link = content.find('a')
                source_id = link['href'][1:]
                fun_resource_url = get_absolute_path(link['href'])
                # Direct download link, if present (currently unused; the
                # resource URL is resolved from the detail page below)
                direct_download_link = content.find('a', class_='dnlinkfunstory')
                if direct_download_link:
                    direct_download_href = direct_download_link['href'].strip()

                # Need to GET the FunResource detail page since main_file is not available in the listing
                fun_rsrc_html = requests.get(fun_resource_url).text
                respath_url = get_respath_url_from_html(fun_rsrc_html)
                fun_doc = BeautifulSoup(fun_rsrc_html, "html.parser")
                download_url = get_download_url_from_doc(url, fun_doc)
                respath_path = urlparse(respath_url).path

                if self.should_ignore_url(respath_url):
                    LOGGER.info('ignoring fun content %s at %s' % (title, respath_url))
                    continue

                LOGGER.debug('      Fun content: %s: %s at %s' % (source_id, title, respath_url))

                if respath_path.endswith(('mp4', 'MP4', 'm4v')):
                    video = dict(
                        url=respath_url,
                        kind='PrathamVideoResource',
                        title=title,
                        description='source_url=' + respath_url if DEBUG_MODE else '',
                        source_id=source_id,
                        thumbnail_url=thumbnail,
                        children=[],
                    )
                    video.update(self.get_video_metadata(respath_url))
                    page_dict['children'].append(video)

                elif respath_path.endswith('pdf'):
                    pdf = dict(
                        url=respath_url,
                        kind='PrathamPdfResource',
                        description='source_url=' + respath_url if DEBUG_MODE else '',
                        title=title,
                        source_id=source_id,
                        thumbnail_url=thumbnail,
                        children=[],
                    )
                    page_dict['children'].append(pdf)

                elif download_url and download_url.endswith('zip'):
                    if '.~' in download_url:
                        # Fix broken links of the form https://www.prathamopenschool.org/Gj/gamelist/CRS174/.~/CourseContent/Games/NumberKas_GJ.zip
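                        # split('/') gives ['https:', '', host, 'Gj', 'gamelist',
                        # 'CRS174', '.~', ...]; keeping [0:3] + [7:] drops the four
                        # bogus path segments and rebuilds
                        # https://www.prathamopenschool.org/CourseContent/Games/NumberKas_GJ.zip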
                        pathels = download_url.split('/')
                        download_url = '/'.join(pathels[0:3] + pathels[7:])
                    zipfile = dict(
                        url=download_url,
                        kind='PrathamZipResource',
                        title=title,
                        description='source_url=' + download_url if DEBUG_MODE else '',
                        source_id=source_id,
                        thumbnail_url=thumbnail,
                        main_file=respath_url,   # needed to rename to index.html if different
                        children=[],
                    )
                    page_dict['children'].append(zipfile)

                elif respath_path.endswith('html'):
                    download_url = respath_url.replace('/index.html', '.zip')
                    html_rsrc = dict(
                        url=download_url,
                        kind='PrathamZipResource', # used to be OtherPrathamHtmlResource
                        title=title,
                        description='source_url=' + download_url if DEBUG_MODE else '',
                        source_id=source_id,
                        thumbnail_url=thumbnail,
                        main_file=respath_url,
                        children=[],
                    )
                    page_dict['children'].append(html_rsrc)

                else:
                    LOGGER.error('Fun resource not supported: onpage=%s  respath_path=%s download_url=%s' % (url, respath_path, download_url))
                    unsupported_rsrc = dict(
                        url=respath_url,
                        referring_url=url,
                        kind='UnsupportedPrathamWebResource',
                        title=title,
                        source_id=source_id,
                        thumbnail_url=thumbnail,
                        children=[],
                    )
                    page_dict['children'].append(unsupported_rsrc)

            except Exception as e:
                LOGGER.error('on_fun_page: %s : %s' % (e, content))
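The if/elif chain above dispatches on the resource's file extension. For illustration only, the same core mapping could be written table-driven; this sketch omits the download_url rewriting that the zip and html branches also perform:

import os

# Illustrative sketch of the extension -> kind dispatch used in on_fun_page.
KIND_BY_EXTENSION = {
    '.mp4': 'PrathamVideoResource',
    '.m4v': 'PrathamVideoResource',
    '.pdf': 'PrathamPdfResource',
    '.zip': 'PrathamZipResource',
    '.html': 'PrathamZipResource',  # standalone HTML pages get repackaged as zips
}

def kind_for_respath(respath_path):
    """Return the internal resource kind for a path, or None if unsupported."""
    ext = os.path.splitext(respath_path)[1].lower()
    return KIND_BY_EXTENSION.get(ext)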
def content_node_from_entry(entry, lang_code):
    """
    Convert a feed entry into ricecooker json dict.
    """
    # METADATA
    ############################################################################
    # author (using ,-separated list in case of multiple authors/contributors)
    authors_str = _author_from_entry(entry)

    # license info
    # currently one of {'African Storybook Initiative', 'USAID'}
    dcterms_publisher = entry['dcterms_publisher']
    license_id = guess_license_id_from_string(entry['dcterms_license'])
    LICENSE = get_license(license_id,
                          copyright_holder=dcterms_publisher).as_dict()

    # the publisher string doubles as the content provider
    provider = dcterms_publisher

    # since we're importing the content from here
    aggregator = 'Global Digital Library'

    # CONTENT
    ############################################################################
    pdf_link = None
    epub_link = None
    thumbnail_url = None
    for link in entry.links:
        if link['type'] == 'application/pdf':
            pdf_link = link
        elif link['type'] == 'application/epub+zip':
            epub_link = link
        elif link['rel'] == _REL_OPDS_IMAGE:
            thumbnail_url = link['href']
        elif link['rel'] == _REL_OPDS_THUMBNAIL:
            pass  # skip thumbnail URLs silently; prefer _REL_OPDS_IMAGE because it has the right file extension
        else:
            print('Skipping link', link)

    # prefer EPUBs...
    if epub_link:
        epub_url = epub_link['href']
        child_node = dict(
            kind=content_kinds.DOCUMENT,
            source_id=entry['id'],
            language=lang_code,
            title=entry['title'],
            description=entry.get('summary', None),
            author=authors_str,
            license=LICENSE,
            provider=provider,
            aggregator=aggregator,
            thumbnail=thumbnail_url,
            files=[],
        )
        epub_file = dict(
            file_type=file_types.EPUB,
            path=epub_url,
            language=lang_code,
        )
        child_node['files'] = [epub_file]
        LOGGER.debug('Created EPUB Document Node from url ' + epub_url)
        return child_node

    # ... but if no EPUB, then get PDF.
    elif pdf_link:
        pdf_url = pdf_link['href']
        child_node = dict(
            kind=content_kinds.DOCUMENT,
            source_id=entry['id'],
            language=lang_code,
            title=entry['title'],
            description=entry.get('summary', None),
            author=authors_str,
            license=LICENSE,
            provider=provider,
            aggregator=aggregator,
            thumbnail=thumbnail_url,
            files=[],
        )

        if dcterms_publisher in BOOK_PUBLISHERS_TO_CROP:  # crop African Storybook PDFs
            pdf_path = crop_pdf_from_url(pdf_url)
        else:
            pdf_path = pdf_url  # upload unmodified PDF

        pdf_file = dict(
            file_type=file_types.DOCUMENT,
            path=pdf_path,
            language=lang_code,
        )
        child_node['files'] = [pdf_file]
        LOGGER.debug('Created PDF Document Node from url ' + pdf_url)
        return child_node

    else:
        LOGGER.warning(
            'Skipping entry because no supported formats were found: %s' % entry)
        return None
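The dcterms_publisher and entry.links access patterns match what feedparser produces when parsing an OPDS/Atom feed, so the calling side presumably looks something like this sketch (the feed URL is a placeholder, not from this file):

import feedparser

# Hedged sketch of the calling side; the URL is a placeholder.
OPDS_FEED_URL = 'https://example.org/opds/en/root.xml'

feed = feedparser.parse(OPDS_FEED_URL)
nodes = []
for entry in feed.entries:
    node = content_node_from_entry(entry, lang_code='en')
    if node:  # entries with neither EPUB nor PDF come back as None
        nodes.append(node)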