def download_wikipedia_page(url, thumbnail, title):
    """Download a Wikipedia page and package it as an HTML5 app node."""
    # Create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # Download the main Wikipedia page, apply a middleware processor, and save it as index.html
    localref, _ = download_file(
        url,
        destpath,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
    )

    # Turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)

    # Create an HTML5 app node
    html5app = nodes.HTML5AppNode(
        files=[files.HTMLZipFile(zippath)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=CHANNEL_LICENSE,
    )

    return html5app
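
A minimal usage sketch (the URL and title are placeholders, and `channel` is assumed to be a ricecooker ChannelNode as in the later examples):

# Hypothetical usage -- attach the returned node to an existing channel
node = download_wikipedia_page(
    "https://en.wikipedia.org/wiki/Zebra",
    thumbnail=None,
    title="Zebra",
)
channel.add_child(node)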
Example #2
def download_wikipedia_page(url, thumbnail, title):
    # create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # Download the main Wikipedia page, apply a middleware processor, and save it as index.html
    localref, _ = download_file(
        url,
        destpath,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
        request_fn=make_request,
    )

    # turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)

    # create an HTML5 app node
    html5app = HTML5AppNode(
        files=[HTMLZipFile(zippath)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=licenses.PublicDomainLicense(),
    )

    return html5app
def transform_html(content):
    """
    Transform the HTML markup taken from `content` (str) to file index.html in
    a standalone zip file. Return the neceesary metadata as a dict.
    """
    chef_tmp_dir = 'chefdata/tmp'
    webroot = tempfile.mkdtemp(dir=chef_tmp_dir)

    metadata = dict(
        kind='html_content',
        source_id=content[0:30],
        zippath=None,  # to be set below
    )

    doc = BeautifulSoup(content, 'html5lib')
    meta = Tag(name='meta', attrs={'charset': 'utf-8'})
    doc.head.append(meta)
    # TODO: add meta language (in case of right-to-left languages)

    # Write out the new index.html
    indexhtmlpath = os.path.join(webroot, 'index.html')
    with open(indexhtmlpath, 'w') as indexfilewrite:
        indexfilewrite.write(str(doc))

    # Zip it
    localize_image_refs(webroot)
    zippath = create_predictable_zip(webroot)
    metadata['zippath'] = zippath

    return metadata
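
A short usage sketch (the HTML string is a placeholder; assumes the `chefdata/tmp` directory already exists, since `tempfile.mkdtemp(dir=...)` requires it):

# Hypothetical usage: package a raw HTML string and read back the zip path
metadata = transform_html('<h1>Hello</h1><p>Standalone content.</p>')
print(metadata['kind'], metadata['source_id'], metadata['zippath'])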
def transform_hpstoryline_folder(contentdir, story_id, node):
    """
    Package the contents of the folder of kind `hpstoryline` called `story_id`
    located in the directory `contentdir` and return the neceesary metadata as a dict.
    """
    sourcedir = os.path.join(contentdir, story_id)
    webroot = os.path.join(contentdir,
                           story_id + '_webroot')  # transformed dir

    if not os.path.exists(sourcedir):
        print('WWW Could not find local resource folder for story_id=',
              story_id)
        return None

    if os.path.exists(webroot):
        shutil.rmtree(webroot)

    # Copy source dir to webroot dir where we'll do the edits and transformations
    shutil.copytree(sourcedir, webroot)
    metadata = dict(
        kind='hpstoryline',
        title_en=node['title'],
        source_id=story_id,
        thumbnail=None,  # TODO
        zippath=None,  # will be set below
    )

    # Zip it
    localize_image_refs(webroot)
    zippath = create_predictable_zip(webroot)
    metadata['zippath'] = zippath

    return metadata
def download_puzzle(puzzle_url, title, description, thumbnail,
                    le_language_code, blockly_language_code):
    """Download a single puzzle and return an HTML5 app node."""
    with WebDriver("https://blockly-games.appspot.com/%s" % puzzle_url,
                   delay=1000) as driver:
        doc = BeautifulSoup(driver.page_source, "html.parser")

    # Create a temporary folder to download all the files for a puzzle.
    destination = tempfile.mkdtemp()

    # Download all the JS/CSS/images/audio/etc we can get from scraping the
    # page source.
    doc = download_static_assets(doc,
                                 destination,
                                 'https://blockly-games.appspot.com',
                                 request_fn=make_request,
                                 url_blacklist=['analytics.js'])

    # Download other files not picked up by the above generic assets fetching,
    # e.g. from GitHub.
    puzzle_name = puzzle_url.split('?')[0]
    download_additional_assets(destination, puzzle_name)

    # Make some modifications to the HTML source -- hide some elements.
    remove_node(doc, '#languageMenu')
    remove_node(doc, '#title')

    # Copy over some of our own JS/CSS files and then add links to them in the
    # page source.
    copy_tree("static", os.path.join(destination, "static"))

    chef_body_script = doc.new_tag("script", src="static/chef_end_of_body.js")
    doc.select_one('body').append(chef_body_script)

    chef_head_script = doc.new_tag("script")
    chef_head_script.string = 'window["BlocklyGamesLang"] = "%s";' % blockly_language_code
    doc.select_one('head').insert(0, chef_head_script)

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print(
        "    Downloaded puzzle %s titled \"%s\" (thumbnail %s) to destination %s"
        % (puzzle_url, title, thumbnail, destination))
    # preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id=puzzle_url,
        title=truncate_metadata(title),
        description=description,
        license=licenses.PublicDomainLicense(copyright_holder='Google'),
        thumbnail=thumbnail,
        files=[files.HTMLZipFile(zip_path)],
        language=le_language_code,
    )
Example #6
def download_writing_topic_category(category_doc, title, level_id):
    destination = tempfile.mkdtemp()

    # Download a font
    font_url = make_fully_qualified_url(
        '//fonts.googleapis.com/css?family=Roboto:400,300,300italic,400italic,700,700italic'
    )
    download_file(font_url,
                  destination,
                  request_fn=make_request,
                  filename='roboto.css')

    # Write out the HTML source, based on CSS formatting from
    # https://k12.thoughtfullearning.com/resources/writingtopics

    topics = (("<li>%s</li>" % topic.text)
              for topic in category_doc.select('.views-row'))
    html_source = """
        <!DOCTYPE html>
        <head>
            <link href='roboto.css' rel='stylesheet' type='text/css'>
            <style>
                ul {
                    margin: 0 0 0 40px;
                    padding: 0;
                }
                li {
                    font-family: "Roboto", sans-serif;
                    font-weight: 300;
                    font-size: 19.2px;
                    line-height: 24.96px;
                    color: #202020;
                    margin-top: 10px;
                }
            </style>
        </head>
        <body>
            <ul>%s</ul>
        </body>
    """ % ''.join(topics)

    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(html_source)

    print("    ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id="%s|%s" % (level_id, title),
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        files=[files.HTMLZipFile(zip_path)],
        language="en",
        thumbnail=writing_topic_thumbnail,
    )
Example #7
    def construct_channel(self, *args, **kwargs):
        # Create ChannelNode from data in self.channel_info
        channel = self.get_channel(*args, **kwargs)

        lang_names = list(self.data.keys())
        lang_names.sort()

        for lang_name in lang_names:
            lang_data = self.data[lang_name]
            LOGGER.info("Creating app for language: {}".format(lang_name))
            lang = languages.getlang_by_native_name(lang_name)

            zip_dir = self.client.create_zip_dir_for_page(lang_data['url'])

            soup = self.client.get_page_soup(lang_data['url'])

            # Remove the translation list if found
            translations = soup.find('div', {'id': 'translations'})
            if translations:
                translations.extract()

            # Grab the localized title
            title = soup.find('span', {'id': 'share_title'}).text

            # Save the modified index.html page
            thumbnail = None
            for resource in lang_data['resources']:
                if 'dp3t.png' in resource:
                    thumbnail = os.path.join(zip_dir, resource)
                    break

            with open(os.path.join(zip_dir, 'index.html'), 'wb') as f:
                f.write(soup.prettify(encoding='utf-8'))

            # create_predictable_zip ensures that the ZIP file does not change each
            # time it's created, so the zip doesn't get re-uploaded just because
            # the zip metadata changed.
            zip_file = zip.create_predictable_zip(zip_dir)
            zip_name = lang.primary_code if lang else lang_name
            zip_filename = os.path.join(self.ZIP_DIR,
                                        "{}.zip".format(zip_name))
            os.makedirs(os.path.dirname(zip_filename), exist_ok=True)
            os.rename(zip_file, zip_filename)

            topic = nodes.TopicNode(source_id=lang_name, title=lang_name)
            zip_node = nodes.HTML5AppNode(
                source_id="covid19-sim-{}".format(lang_name),
                title=title,
                files=[files.HTMLZipFile(zip_filename)],
                license=licenses.PublicDomainLicense(
                    "Marcel Salathé & Nicky Case"),
                language=lang,
                thumbnail=thumbnail)
            topic.add_child(zip_node)
            channel.add_child(topic)

        return channel
    def _scrape_story_html5(self, story):
        url = story['url']
        page = self._html.get(url)
        story_section = page.find('section', id='section-main')
        links_section = story_section.find('div', class_='languages-links')

        # Is there a way to cross link HTML5AppNode?
        if links_section:
            links_section.extract()

        title = self.__get_text(story_section.find('h1', class_='page-header'))
        language_code = self.__get_language_code(story['language'])
        dest_path = tempfile.mkdtemp(dir=NalibaliChef.ZIP_FILES_TMP_DIR)

        for img in story_section.find_all('img'):
            self._scrape_download_image(dest_path, img)

        basic_page_str = """
        <!DOCTYPE html>
        <html>
          <head>
            <meta charset="utf-8">
            <title></title>
          </head>
          <body>
          </body>
        </html>"""
        basic_page = BeautifulSoup(basic_page_str, "html.parser")
        body = basic_page.find('body')
        body.append(story_section)
        with open(os.path.join(dest_path, 'index.html'), 'w',
                  encoding="utf8") as index_html:
            index_html.write(str(basic_page))
        zip_path = create_predictable_zip(dest_path)
        parsed_story_url = urlparse(url)
        return dict(
            kind=content_kinds.HTML5,
            source_id=parsed_story_url.path if parsed_story_url else url,
            title=title,
            language=language_code,
            description=story['description'],
            license=NalibaliChef.LICENSE,
            thumbnail=story['thumbnail'],
            files=[
                dict(
                    file_type=content_kinds.HTML5,
                    path=zip_path,
                    language=language_code,
                )
            ],
        )
Example #9
    def create_zip_from_dir(self, dir_to_zip):
        """
        Adds all the files and subfolders from dir_to_zip into a Kolibri-compatible zip file.

        :param dir_to_zip: Directory containing files to zip.
        :return: Path to zip file. Note that this file is stored in the temp dir and will not persist across runs.
        """
        temp_zip = zip.create_predictable_zip(dir_to_zip)
        zip_hash = files.get_hash(temp_zip)
        zip_dir = os.path.join(self.cache_dir, 'zips')
        if not os.path.exists(zip_dir):
            os.makedirs(zip_dir)
        output_zip = os.path.join(zip_dir, '{}.zip'.format(zip_hash))
        os.rename(temp_zip, output_zip)
        return output_zip
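
Because the output zip is named after the hash of its contents, repeated calls on identical input resolve to the same cached file. A usage sketch (the directory path is a placeholder):

# Hypothetical usage: zip a prepared webroot and get the cached path back
output_zip = self.create_zip_from_dir('/tmp/my_webroot')
print('Cached zip at:', output_zip)  # e.g. <cache_dir>/zips/<hash>.zip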
Example #10
def test_create_many_predictable_zip_files(ndirs=8193):
    """
    Regression test for `OSError: [Errno 24] Too many open files` when using
    ricecooker.utils.zip.create_predictable_zip helper method:
    https://github.com/learningequality/ricecooker/issues/185
    Run `ulimit -a` to see the limits for # open files on your system and set ndirs
    to higher number to use this test. Also comment out the @pytest.mark.skip
    """
    zip_paths = []
    for _ in range(0, ndirs):
        inputdir = tempfile.mkdtemp()
        with open(os.path.join(inputdir, 'index.html'), 'w') as testf:
            testf.write('something something')
        zip_path = create_predictable_zip(inputdir)
        zip_paths.append(zip_path)
    assert len(zip_paths) == ndirs, 'wrong number of zip files created'
Example #11
def create_html5_app_node(license,
                          content_dict,
                          ims_dir,
                          scraper_class=None,
                          temp_dir=None,
                          needs_scorm_support=False):
    if scraper_class:
        index_path = os.path.join(ims_dir, content_dict['index_file'])

        if '?' in index_path:
            index_path = index_path.split('?')[0]
        if '#' in index_path:
            index_path = index_path.split('#')[0]
        if content_dict['scormtype'] == 'sco' and needs_scorm_support:
            add_scorm_support(index_path, ims_dir)

        index_uri = pathlib.Path(os.path.abspath(index_path)).as_uri()
        zip_name = '%s.zip' % hashlib.md5(
            index_uri.encode('utf-8')).hexdigest()
        temp_dir = temp_dir if temp_dir else tempfile.gettempdir()
        zip_path = os.path.join(temp_dir, zip_name)
        scraper = scraper_class(index_uri)
        scraper.download_file(zip_path)
        logging.info('Webmixer scraper outputted HTML app to %s' % zip_path)

    else:
        with tempfile.TemporaryDirectory() as destination:
            index_src_path = os.path.join(ims_dir, content_dict['index_file'])
            index_dest_path = os.path.join(destination, 'index.html')
            shutil.copyfile(index_src_path, index_dest_path)

            for file_path in content_dict['files']:
                shutil.copy(os.path.join(ims_dir, file_path), destination)

            if content_dict.get('scormtype') == 'sco' and needs_scorm_support:
                add_scorm_support(index_dest_path, destination)

            #preview_in_browser(destination)
            zip_path = create_predictable_zip(destination)

    return nodes.HTML5AppNode(
        source_id=content_dict['identifier'],
        title=content_dict.get('title'),
        license=license,
        files=[files.HTMLZipFile(zip_path)],
    )
Example #12
def create_html5_app_node(license, content_dict):
    with tempfile.TemporaryDirectory() as destination:
        index_copy_path = os.path.join(destination, 'index.html')
        shutil.copyfile(content_dict['index_file'], index_copy_path)

        for file_path in content_dict['files']:
            shutil.copy(file_path, destination)

        #preview_in_browser(destination)

        zip_path = create_predictable_zip(destination)
        return nodes.HTML5AppNode(
            source_id=content_dict['identifier'],
            title=content_dict.get('title'),
            license=license,
            files=[files.HTMLZipFile(zip_path)],
        )
def get_phet_zip_file(zip_file_url, main_file_and_query):
    """
    Phet simulations are provided in the zip file `phet.zip`, and the entry point
    is passed as a GET parameter in `main_file_and_query`. To make these compatible
    with Kolibri's default behaviour of loading index.html, we will:
      - Rename the main entry point file to phetindex.html
      - Add a custom index.html that uses a javascript redirect to phetindex.html?{sim_id}
    """
    u = urlparse(main_file_and_query)
    idk, sim_id = u.query.split('=')
    assert idk == 'id', 'unknown query string format found: ' + main_file_and_query
    main_file = u.scheme + '://' + u.netloc + u.path  # skip querystring

    destpath = tempfile.mkdtemp()
    LOGGER.info('saving phet zip file in dir ' + destpath)
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # Rename main_file to phetindex.html
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'phetindex.html')
        os.rename(src, dest)

        # Create the custom index.html that redirects to the renamed entry point
        index_html = PHET_INDEX_HTML_TEMPLATE.format(sim_id=sim_id)
        with open(os.path.join(zip_folder, 'index.html'), 'w') as indexf:
            indexf.write(index_html)

        # Always be zipping!
        return create_predictable_zip(zip_folder)

    except Exception as e:
        LOGGER.error("get_phet_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file_and_query, destpath, e))
        return None
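
The PHET_INDEX_HTML_TEMPLATE constant is not shown in this listing; a minimal sketch of what such a redirect page could look like, assuming `{sim_id}` is the only substitution made by `.format(sim_id=sim_id)`:

# Hypothetical template -- the real PHET_INDEX_HTML_TEMPLATE may differ
PHET_INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
  <head><meta charset="utf-8"></head>
  <body>
    <script>
      // Redirect to the renamed entry point, preserving the simulation id
      window.location.href = 'phetindex.html?id={sim_id}';
    </script>
  </body>
</html>"""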
Example #14
def download_content_node(url, title):
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc, destination,
            'http://migranthealth.eu/', request_fn=make_request,
            url_blacklist=url_blacklist, derive_filename=derive_filename)

    nodes_to_remove = [
        'header',
        '#page-top-header',
        '#block-region-side-pre',
        '#region-main .row-fluid .span4.heading-rts',
        '.readmoreLinks',
        '.courseSectionNext',
        'img[alt="next"]',
        '.modified',
        '.footer-rts',
        '#page-footer',
        '.back-to-top',
        '.skiplinks',
        '.linkicon',
        '.generalbox table tr:nth-of-type(2)',
    ]
    for selector in nodes_to_remove:
        for node in doc.select(selector):
            node.decompose()

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("        ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=MEET_LICENSE,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )
def scrape_content_page(content_page_url, lang):
    """
    Download standalone HTML content pages (non-modules).
    Used for "Curriculum framework" and standalone pages in "Resources".
    Returns:
        page_info (dict):  info necessary for constructing HTML5AppNode and HTMLZipFile
          - title
          - source_id
          - description
          - zip_path
    """
    LOGGER.debug('Scraping content page @ url = ' + str(content_page_url))
    doc = get_parsed_html_from_url(content_page_url)

    destination = tempfile.mkdtemp()
    print('destination=', destination)

    source_id = parse_qs(urlparse(content_page_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    content_title = raw_title.replace('OLCreate:', '')\
            .replace('TESSA_ARABIC', '')\
            .replace('TESSA_Eng', '')\
            .replace('TESSA_Fr', '')\
            .strip()

    page_info = dict(
        lang=lang,
        source_id=source_id,
        title=content_title,
        description=None,
        children=[],
    )

    # Do the actual download
    download_page(content_page_url, destination, 'index.html', lang)

    # zip it
    page_info['zip_path'] = create_predictable_zip(destination)

    # ship it
    return page_info
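
A sketch of how the returned `page_info` dict might be consumed downstream, following the node-construction pattern of the other examples (`content_page_url` and `TESSA_LICENSE` are assumed names):

# Hypothetical downstream use of page_info
page_info = scrape_content_page(content_page_url, 'en')
html_node = nodes.HTML5AppNode(
    source_id=page_info['source_id'],
    title=page_info['title'],
    description=page_info['description'],
    license=TESSA_LICENSE,  # placeholder license object
    files=[files.HTMLZipFile(page_info['zip_path'])],
    language=page_info['lang'],
)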
def download_wikipedia_page(url, title, writer, thumbnail=None):
    """Download a Wikipedia page, package it as a zip, and add it to the writer."""
    # Create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # Generate details for files
    details = {
        'thumbnail': thumbnail,
        'source_id': url.split("/")[-1],
        'license': CHANNEL_LICENSE,
    }

    # Download the main Wikipedia page, apply a middleware processor, and save it as index.html
    localref, _ = download_file(url,
                                destpath,
                                filename="index.html",
                                middleware_callbacks=process_wikipedia_page)

    # Turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)
    writer.add_file(str(PATH), title, zippath, **details)
Example #17
def package_html_content_as_html5_zip_file(html):
    """
    Transform the HTML markup in `html["content"]` (str) to file index.html in
    a standalone zip file. Return the neceesary metadata as a dict.
    """
    chef_tmp_dir = 'chefdata/tmp'
    webroot = tempfile.mkdtemp(dir=chef_tmp_dir)
    content = html['content']
    doc = BeautifulSoup(content, 'html5lib')
    meta = Tag(name='meta', attrs={'charset': 'utf-8'})
    doc.head.append(meta)
    # TODO: add meta language (in case of right-to-left languages)

    # Write out the new index.html
    indexhtmlpath = os.path.join(webroot, 'index.html')
    with open(indexhtmlpath, 'w') as indexfilewrite:
        indexfilewrite.write(str(doc))

    # Zip it
    zippath = create_predictable_zip(webroot)
    return zippath
Example #18
def get_zip_file(zip_file_url, main_file):
    """HTML games are provided as zip files, the entry point of the game is
     main_file. main_file needs to be renamed to index.html to make it
     compatible with Kolibri.
    """
    destpath = tempfile.mkdtemp()
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # In some cases, the files are under the www directory,
        # let's move them up one level.
        www_dir = os.path.join(zip_folder, 'www')
        if os.path.isdir(www_dir):
            files = os.listdir(www_dir)
            for f in files:
                shutil.move(os.path.join(www_dir, f), zip_folder)

        # Rename main_file to index.html.
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'index.html')
        os.rename(src, dest)

        return create_predictable_zip(zip_folder)
    except Exception as e:
        LOGGER.error("get_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file, destpath, e))
        return None
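
A usage sketch wrapping the returned zip path into an HTML5 app node, as in the surrounding examples (the URL, entry point, title, and copyright holder are placeholders):

# Hypothetical usage of get_zip_file
zip_path = get_zip_file('https://example.com/games/game.zip', 'main.html')
if zip_path:
    node = nodes.HTML5AppNode(
        source_id='game.zip',
        title='Example game',
        license=licenses.PublicDomainLicense(copyright_holder='Example'),
        files=[files.HTMLZipFile(zip_path)],
    )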
    def get_content_zip(self, page):
        """
        Get the zip path of the content.
        """
        # Find the zip url of the content and check if it's valid.
        zip_href = page.find("a", href=re.compile(".zip"))
        if not zip_href:
            return None
        zip_url = "http://proyectodescartes.org{}".format(zip_href["href"])
        zip_resp = downloader.make_request(zip_url)

        if zip_resp.status_code != 200:
            return None

        filepath = "/tmp/{}".format(zip_url.split("/")[-1])
        with open(filepath, "wb") as f:
            f.write(zip_resp.content)

        dst = tempfile.mkdtemp()
        html_name = page.find(
            "div", class_="itemFullText").find("a")["href"].split("/")[-1]

        # Unzip the downloaded zip file and zip the folder again. If index.html
        # does not exist at the topmost level, rename the index page in the
        # folder to index.html before zipping the folder again.
        with zipfile.ZipFile(filepath) as zf:
            extracted_src = unquote(filepath.split("/")[-1].split(".zip")[0])
            zf.extractall(dst)
            if html_name != "index.html":
                src_index = os.path.join(dst, extracted_src, html_name)
                dst_index = src_index.replace(html_name, "index.html")
                if os.path.exists(src_index):
                    os.rename(src_index, dst_index)
            zip_path = create_predictable_zip(os.path.join(dst, extracted_src))

        return zip_path
def get_zip_file(zip_file_url, main_file):
    """
    HTML games are provided as zip files, the entry point of the game is `main_file`.
    THe `main_file` needs to be renamed to index.html to make it compatible with Kolibri.
    """
    key = zip_file_url + main_file
    destpath = make_temporary_dir_from_key(key)

    # Check for "REPLACE WITH:" correction rule for the current `zip_file_url`
    replacement_url = should_replace_with(zip_file_url)
    if replacement_url:
        zip_file_url = replacement_url

    # return cached version if already there
    final_webroot_path = os.path.join(destpath, 'webroot.zip')
    if os.path.exists(final_webroot_path):
        return final_webroot_path

    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]  # e.g. Mathematics.zip
        zip_basename = zip_filename.rsplit('.', 1)[0]  # e.g. Mathematics/

        # July 31: handle edge cases where zip filename doesn't match folder name inside it
        awazchitras = [
            'Awazchitra_HI', 'Awazchitra_TL', 'Awazchitra_KN', 'Awazchitra_BN',
            'Awazchitra_OD', 'Awazchitra_PN', 'Awazchitra_TM'
        ]
        for awazchitra in awazchitras:
            if awazchitra in zip_basename:
                zip_basename = zip_basename.replace('Awazchitra', 'AwazChitra')
        if '_KKS_Hi' in zip_basename:
            zip_basename = zip_basename.replace('_KKS_Hi', '_KKS_HI')

        # Mar 2: more edge cases where zip filename doesn't match folder name inside it
        if 'Memorygamekb' in zip_basename:
            zip_basename = zip_basename.replace('Memorygamekb', 'MemoryGamekb')
        if 'cityofstories' in zip_basename:
            zip_basename = zip_basename.replace('cityofstories',
                                                'CityOfStories')

        # Jun 12: fix more edge cases where .zip filename doesn't match dir name
        if '_KKS_Gj' in zip_basename:
            zip_basename = zip_basename.replace('_KKS_Gj', '_KKS_GJ')
        if 'ShabdKhel' in zip_basename:
            zip_basename = zip_basename.replace('ShabdKhel', 'Shabdkhel')

        zip_folder = os.path.join(destpath, zip_basename)  # e.g. destpath/Mathematics/
        main_file = main_file.split('/')[-1]  # e.g. activity_name.html or index.html

        if 'KhelbadiKahaniyan_MR' in zip_basename:
            # Inconsistency --- `main_file` contains dir name, and not index.html
            main_file = 'index.html'

        # Jul 8th: handle weird case-insensitive webserver main_file
        if main_file == 'mainexpand.html':
            main_file = 'mainExpand.html'  # <-- this is the actual filename in the zip

        # Zip files from Pratham website have the web content inside a subfolder
        # with the same name as the zip filename. We need to recreate these zip
        # files to make sure the index.html is in the root of the zip.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            # If main_file is in the root (like zips from the game repository)
            # then we need to extract the zip contents to subfolder zip_basename/
            for zfileinfo in zf.filelist:
                if zfileinfo.filename == main_file:
                    destpath = os.path.join(destpath, zip_basename)
            # Extract zip so main file will be in destpath/zip_basename/index.html
            zf.extractall(destpath)

        # In some cases, the files are under the www directory,
        # let's move them up one level.
        www_dir = os.path.join(zip_folder, 'www')
        if os.path.isdir(www_dir):
            files = os.listdir(www_dir)
            for f in files:
                shutil.move(os.path.join(www_dir, f), zip_folder)

        # Rename `main_file` to index.html
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'index.html')
        os.rename(src, dest)

        # Logic to add margin-top:44px; for games that match Corrections tab
        add_margin_top = False
        for row in PRADIGI_CORRECTIONS_LIST:
            if row[CORRECTIONS_ACTION_KEY] == ADD_MARGIN_TOP_ACTION:
                pat = row[CORRECTIONS_SOURCE_URL_PAT_KEY]
                m = pat.match(zip_file_url)
                if m:
                    add_margin_top = True
        if add_margin_top:
            if zip_file_url.endswith('CourseContent/Games/Mathematics.zip'):
                LOGGER.info(
                    "adding body.margin-top:44px; to ALL .html files in: %s" %
                    zip_file_url)
                for root, dirs, files in os.walk(zip_folder):
                    for file in files:
                        if file.endswith(".html"):
                            add_body_margin_top(root, file)
            else:
                LOGGER.info(
                    "adding body.margin-top:44px; to index.html in: %s" %
                    zip_file_url)
                add_body_margin_top(zip_folder, 'index.html')

        # Replace occurrences of `main_file` with index.html to avoid broken links
        for root, dirs, files in os.walk(zip_folder):
            for file in files:
                if file.endswith(".html") or file.endswith(".js"):
                    file_path = os.path.join(root, file)
                    # use bytes to avoid Unicode errors "invalid start/continuation byte"
                    bytes_in = open(file_path, 'rb').read()
                    bytes_out = bytes_in.replace(main_file.encode('utf-8'),
                                                 b'index.html')
                    open(file_path, 'wb').write(bytes_out)

        # Create the predictable zip file and copy it to the final cached location
        tmp_predictable_zip_path = create_predictable_zip(zip_folder)
        shutil.copyfile(tmp_predictable_zip_path, final_webroot_path)
        return final_webroot_path

    except Exception as e:
        LOGGER.error("get_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file, destpath, e))
        return None
Example #21
def download_content_node(category_node,
                          url,
                          title,
                          thumbnail=None,
                          description=None):
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc,
                                 destination,
                                 'https://k12.thoughtfullearning.com',
                                 request_fn=make_request,
                                 url_blacklist=url_blacklist)

    remove_node(doc, '#header')
    remove_node(doc, '.subMenuBarContainer')
    remove_node(doc, '.breadbookmarkcontainer')
    remove_node(doc, '.resourcePageTypeTitle')
    remove_node(doc, '.sharethis-wrapper')
    remove_node(doc, '.ccBlock')
    remove_node(doc, '#block-views-resource-info-block-block-1')
    remove_node(doc, '#block-views-resource-info-block-block')
    remove_node(doc, '.productSuggestionContainer')
    remove_node(doc, 'footer')

    # For minilessons
    remove_node(doc, '.field-name-field-minilesson-downloadables')

    # For writing assessments
    remove_node(doc, '.assessmentTGLink')
    remove_node(doc, '.assessmentModelRubrics')
    remove_node(doc, '.view-display-id-attachment_1')

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("    ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    thumbnail_path = None
    if thumbnail:
        # Manually download the thumbnail and use it so we can lowercase the
        # extension to be accepted by Ricecooker.
        thumbnail_filename = derive_filename(thumbnail)
        thumbnail_path = os.path.join(destination, thumbnail_filename)
        download_file(thumbnail,
                      destination,
                      request_fn=make_request,
                      filename=thumbnail_filename)

    # If there is an embedded video in the page source grab it as a video node.
    video_node = None
    iframe = doc.select_one('.embedded-video iframe')
    if iframe:
        youtube_url = iframe['src']
        youtube_id = get_youtube_id_from_url(youtube_url)
        info = ydl.extract_info(youtube_url, download=False)
        video_title = info['title']
        print(
            "    ... and with video titled %s from www.youtube.com/watch?v=%s"
            % (video_title, youtube_id))
        video_node = nodes.VideoNode(
            source_id=youtube_id,
            title=truncate_metadata(info['title']),
            license=licenses.CC_BY_NC_SALicense(
                copyright_holder=truncate_metadata('Thoughtful Learning')),
            description=info['description'],
            language="en",
            derive_thumbnail=True,
            files=[files.YouTubeVideoFile(youtube_id)],
        )
        category_node.add_child(video_node)

    zip_path = create_predictable_zip(destination)
    app_node = nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        description=description,
        thumbnail=thumbnail_path,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )

    category_node.add_child(app_node)
def download_module(module_url, lang=None):
    LOGGER.debug('Scraping module @ url = ' + module_url)
    doc = get_parsed_html_from_url(module_url)
    source_id = parse_qs(urlparse(module_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    module_title = raw_title.replace('OLCreate:', '')\
            .replace('TESSA_ARABIC', '')\
            .replace('TESSA_Eng', '')\
            .replace('TESSA_Fr', '')\
            .strip()
    module_contents_dict = dict(
        kind='TessaModuleContentsDict',
        lang=lang,
        source_id=source_id,
        title=module_title,
        children=[],
    )

    # TRY TO CREATE MODULE TOC SIDEBAR MENU
    ############################################################################
    current_li_deep = doc.find('li', class_='oucontent-tree-current')

    # Sept 5th: special treatment for modules with no TOC in sidebar
    if current_li_deep is None:
        return download_module_no_toc(module_url, lang=lang)

    # CREATE MODULE TOC SIDEBAR MENU
    # July 28 HACK: infer module_toc_li using marker on sublist-li
    ############################################################################
    destination = tempfile.mkdtemp()
    print('destination=', destination)
    # copy css/js/images from skel
    shutil.copytree('chefdata/templates/module_skel/styles',
                    os.path.join(destination, 'styles'))

    is_first_section = True
    module_toc_li = current_li_deep.find_parent('li', class_='item-section')
    # print(module_toc_li.prettify())
    # module_contents_div = module_toc_li.find('div', class_='oucontent-contents')
    outer_module_ul = module_toc_li.find('ul',
                                         class_='child-item-list',
                                         recursive=False)
    inner_module_ul = outer_module_ul.find(
        'div', class_='oucontent-contents').find('ul', recursive=False)
    section_lis = inner_module_ul.find_all('li', recursive=False)
    print(len(section_lis))

    # DETECT IF SIMPLE MODULE (single page, no sections) OR COMPLEX MODULE (with sections)
    if len(section_lis) == 0:
        print('UNEXPECTED --------  len(section_lis) == 0')
        print(module_url, '<<< <<< ' * 6)
    if len(section_lis) == 1:
        is_simple_module = True
    else:
        is_simple_module = False

    # SIMPLE MODULES THAT CONSIST OF A SINGLE PAGE -- becomes index.html
    if is_simple_module:
        section_li = section_lis[0]
        # print('*'*120)
        # print(section_li.prettify())
        section_title_span = section_li.find('span',
                                             class_='oucontent-tree-item')
        section_title = get_text(section_title_span)
        print('Processing simple module:', section_title)
        section_dict = dict(
            kind='TessaModuleContentsSection',
            title=section_title,
            href=module_url,
            filename='index.html',  # TODO: figure out if this is necessary
            children=[],
        )
        # print('  section:', section_title)
        module_contents_dict['children'].append(section_dict)

        subsections_ul = section_li.find('ul', recursive=False)
        if subsections_ul:
            pass
            #print('found some subsections...')
        else:
            pass
            #print('no subsections <ul> found in this section')

        download_page(module_url, destination, 'index.html', lang)
    # /SIMPLE MODULE

    # COMPLEX MODULES WITH SECTIONS AND custom-made TOC in index.html
    else:
        for section_li in section_lis:

            if 'download individual sections' in get_text(
                    section_li):  # TODO: AR, SW, FR
                print(
                    'skipping section "Read or download individual sections..." '
                )
                continue

            # print(section_li.prettify())
            # print('>'*80)
            section_title_span = section_li.find('span',
                                                 class_='oucontent-tree-item')
            if section_title_span:
                if section_title_span.find('span', class_='current-title'):
                    section_href = module_url
                else:
                    section_a = section_title_span.find('a')
                    if section_a:
                        section_href = section_a['href']
                    else:
                        section_href = '#NOLINK'  # for sections like "Top 20 ideas for teaching large classes"
            else:
                section_href = '#NOLINK'  # for sections like "Read or download individual sections of the m..."

            # special case for first section --- since it doesn't save section in filename
            # manually call download_page with filename section_1.html with contents of current page
            if is_first_section:
                section_filename = 'section-1.html'
                is_first_section = False
            else:
                if '#NOLINK' not in section_href:
                    section_filename = get_section_filename(section_href)

            # accesshide_span = section_title_span.find('span', class_='accesshide')
            # if accesshide_span:
            #     accesshide_span.extract()
            # subsections_ul.extract()
            section_title = get_text(section_title_span)

            section_dict = dict(
                kind='TessaModuleContentsSection',
                title=section_title,
                href=section_href,
                filename=section_filename,
                children=[],
            )
            # print('  section:', section_title)
            module_contents_dict['children'].append(section_dict)

            subsections_ul = section_li.find('ul', recursive=False)
            if subsections_ul:
                subsection_lis = subsections_ul.find_all('li')
                for subsection_li in subsection_lis:
                    # print('<'*100)
                    # print(subsection_li)
                    #print('>>>>>')
                    #print(subsection_li.prettify())
                    subsection_link = subsection_li.find('a')
                    if not subsection_link:  # handle weird items that have no link
                        LOGGER.warning('(((((  Skipping section ' +
                                       subsection_li.get_text() +
                                       ' because no subsection_link')
                        continue
                    subsection_href = subsection_link['href']
                    subsection_filename = get_section_filename(subsection_href)
                    # subaccesshide_span = subsection_li.find('span', class_='accesshide')
                    # if subaccesshide_span:
                    #     subaccesshide_span.extract()
                    subsection_title = get_text(subsection_li)
                    subsection_dict = dict(
                        kind='TessaModuleContentsSubsection',
                        title=subsection_title,
                        href=subsection_href,
                        filename=subsection_filename,
                    )
                    # print('    subsection:', subsection_title)
                    section_dict['children'].append(subsection_dict)
            else:
                print('no subsections <ul> found in this section')

        module_index_tmpl = jinja2.Template(
            open('chefdata/templates/module_index.html').read())
        index_contents = module_index_tmpl.render(module=module_contents_dict)
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write(index_contents)

        # download the html content from each section/subsection
        for section in module_contents_dict['children']:
            if '#NOLINK' in section['href']:
                print('nothing to download for #NOLINK section')
                continue
            download_section(section['href'], destination, section['filename'],
                             lang)
            for subsection in section['children']:
                if '#NOLINK' in subsection['href']:
                    print('nothing to download for #NOLINK subsection')
                    continue
                download_section(subsection['href'], destination,
                                 subsection['filename'], lang)
        # /COMPLEX MODULE

    zip_path = create_predictable_zip(destination)
    return zip_path
def download_module_no_toc(module_url, lang=None):
    """
    Extracting the module table of contents from the sidebar nav doesn't work for certain modules in FR
    e.g. http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=105334&section=1.1

    If NO TOC is available, then we'll crawl pages one by one
    (`module_contents_dict`)
    """
    LOGGER.debug('Scraping module @ url = ' + str(module_url))
    doc = get_parsed_html_from_url(module_url)
    destination = tempfile.mkdtemp()
    print('destination=', destination)

    # copy css/js/images from skel
    shutil.copytree('chefdata/templates/module_skel/styles',
                    os.path.join(destination, 'styles'))

    source_id = parse_qs(urlparse(module_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    module_title = raw_title.replace('OLCreate:', '')\
            .replace('TESSA_ARABIC', '')\
            .replace('TESSA_Eng', '')\
            .replace('TESSA_Fr', '')\
            .strip()

    module_contents_dict = dict(
        kind='TessaModuleContentsDict',
        source_id=source_id,
        title=module_title,
        lang=lang,
        children=[],
    )
    # print(module_contents_dict)

    # iteratively download all sections by following "Next" links
    current_url = module_url
    current_section = None
    is_first_section = True
    while True:
        LOGGER.debug('processing current_url' + str(current_url))
        current_doc = get_parsed_html_from_url(current_url)

        # special handling for module-level page (no section in url but is really Section 1)
        if is_first_section:
            section_filename = 'section-1.html'
            is_first_section = False
        else:
            section_filename = get_section_filename(current_url)

        # Do the actual download
        download_section(current_url, destination, section_filename, lang)

        # Store section/subsection info so we can build the TOC later
        doc = get_parsed_html_from_url(current_url)
        raw_title = doc.select_one("head title").text
        the_title = raw_title.replace('OLCreate:', '')\
                .replace('TESSA_ARABIC', '')\
                .replace('TESSA_Eng', '')\
                .replace('TESSA_Fr', '')\
                .strip()

        # sections e.g. section-3.html
        if '_' not in section_filename:
            section_dict = dict(kind='TessaModuleContentsSection',
                                title=the_title,
                                href=current_url,
                                filename=section_filename,
                                children=[])
            module_contents_dict['children'].append(section_dict)
            print('  - section:', the_title[0:80])
            current_section = section_dict

        # subsections e.g. section-3_2.html
        else:
            subsection_title = the_title.replace(module_title, '')
            subsection_title = subsection_title.replace(current_section['title'], '')
            subsection_title = subsection_title.lstrip()
            if subsection_title.startswith(': '):
                subsection_title = subsection_title.replace(': ', '', 1)
            subsection_dict = dict(
                kind='TessaModuleContentsSubsection',
                title=subsection_title,
                href=current_url,
                filename=section_filename,
            )
            print('     - subsection:', subsection_title[0:80])
            current_section['children'].append(subsection_dict)

        # Recurse if next
        next_url = _get_next_section_url(current_doc)
        if next_url:
            current_url = next_url
        else:
            break

    # for debugging...
    # pp.pprint(module_contents_dict)

    module_index_tmpl = jinja2.Template(
        open('chefdata/templates/module_index.html').read())
    index_contents = module_index_tmpl.render(module=module_contents_dict)
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(index_contents)

    # return module_contents_dict
    zip_path = create_predictable_zip(destination)
    return zip_path
Example #24
def transform_articulate_storyline_folder(contentdir, activity_ref):
    """
    Transform the contents of the folder of kind `articulate_storyline` called
    `activity_ref` located in the directory `contentdir` to adapt it to Kolibri
    plarform, package it as a zip, and return the neceesary metadata as a dict.
    """
    sourcedir = os.path.join(contentdir, activity_ref)  # source folder
    webroot = os.path.join(contentdir,
                           activity_ref + '_webroot')  # transformed dir

    if not os.path.exists(sourcedir):
        print('WWW Could not find local resource folder for activity_ref=',
              activity_ref)
        return None

    if os.path.exists(webroot):
        shutil.rmtree(webroot)

    # Copy source dir to webroot dir where we'll do the edits and transformations
    shutil.copytree(sourcedir, webroot)

    # Remove unnecessary files
    html_files_to_remove = ['story.html', 'story.swf', 'story_flash.html']
    for html_file in html_files_to_remove:
        filepath = os.path.join(webroot, html_file)
        if os.path.exists(filepath):
            os.remove(filepath)

    # Remove all .swf files from webroot/
    for root, dirs, files in os.walk(webroot):
        for file in files:
            filepath = os.path.join(root, file)
            _, ext = os.path.splitext(filepath)
            if ext == '.swf':
                os.remove(filepath)

    metapath = os.path.join(webroot, 'meta.xml')
    metaxml = open(metapath, 'r').read()
    metadoc = BeautifulSoup(metaxml, "html5lib")
    project = metadoc.find('project')
    # TODO: get author from     project > <author name="Victoria" email="" website="" />
    metadata = dict(
        kind='articulate_storyline',
        title_en=project['title'],
        source_id=activity_ref,
        thumbnail=os.path.join(webroot, project.attrs['thumburl']),
        datepublished=project['datepublished'],
        duration=project['duration'],
        totalaudio=project['totalaudio'],
        zippath=None,  # to be set below
    )

    # Setup index.html
    indexhtmlpath = os.path.join(webroot, 'index.html')
    shutil.move(os.path.join(webroot, 'story_html5.html'), indexhtmlpath)

    # load index.html
    with open(indexhtmlpath, 'r') as indexfileread:
        indexhtml = indexfileread.read()
    doc = BeautifulSoup(indexhtml, 'html5lib')

    # A. Localize js libs in <HEAD>
    scriptsdir = os.path.join(webroot, 'scripts')
    if not os.path.exists(scriptsdir):
        os.mkdir(scriptsdir)
    headscripts = doc.find('head').find_all('script')
    for script in headscripts:
        script_url = script['src']
        script_basename = os.path.basename(script_url)
        response = requests.get(script_url, verify=False)
        with open(os.path.join(scriptsdir, script_basename),
                  'wb') as scriptfile:
            scriptfile.write(response.content)
        scriptrelpath = os.path.join('scripts', script_basename)
        script['src'] = scriptrelpath

    # B. Inline css files to avoid CORS issues
    styles = doc.find('body').find_all('link', rel="stylesheet")
    for style in styles:
        style_href = style['href']
        style_path = os.path.join(webroot, style_href)
        if not os.path.exists(style_path) and 'min.css' in style_path:
            style_path = style_path.replace('min.css', 'css')
        style_content = '\n' + open(style_path).read()
        inline_style_tag = doc.new_tag('style')
        inline_style_tag['data-noprefix'] = ''
        inline_style_tag['rel'] = 'stylesheet'
        inline_style_tag.string = style_content
        style.replace_with(inline_style_tag)

    # C. Ensure that js files exist (rewrite app.min.js --> app.js if needed)
    bodyscripts = doc.find('body').find_all('script')
    for script in bodyscripts:
        if script.has_attr('src'):
            script_src = script['src']
            script_path = os.path.join(webroot, script_src)
            if not os.path.exists(script_path) and 'min.js' in script_path:
                new_script_path = script_src.replace('min.js', 'js')
                script['src'] = new_script_path
                print('    replaced script_src', script_src,
                      'with new_script_path', new_script_path)

    # Save modified index.html
    with open(indexhtmlpath, 'w') as indexfilewrite:
        indexfilewrite.write(str(doc))

    # Zip it
    localize_image_refs(webroot)
    zippath = create_predictable_zip(webroot)
    metadata['zippath'] = zippath

    return metadata
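
A sketch of how the returned metadata dict might be consumed (node and license classes are assumed from the other examples; the copyright holder is a placeholder):

# Hypothetical downstream use of the metadata dict
meta = transform_articulate_storyline_folder(contentdir, activity_ref)
if meta:
    node = nodes.HTML5AppNode(
        source_id=meta['source_id'],
        title=meta['title_en'],
        thumbnail=meta['thumbnail'],
        files=[files.HTMLZipFile(meta['zippath'])],
        license=licenses.CC_BYLicense(copyright_holder='placeholder'),
    )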
def scrape_content(title, content_url):
    """
    title: Boys' clothing
    content_url: http://www.touchableearth.org/china-culture-boys-clothing/
    """
    print("    Scraping content node: %s (%s)" % (title, content_url))

    doc = get_parsed_html_from_url(content_url)
    if not doc:  # 404
        return None

    description = create_description(doc)
    source_id = doc.select_one(".current_post.active .post_id")["value"]

    base_node_attributes = {
        "source_id": source_id,
        "title": title,
        "license": TE_LICENSE,
        "description": description,
    }

    youtube_iframe = doc.select_one(".video-container iframe")
    if youtube_iframe:
        youtube_url = doc.select_one(".video-container iframe")["src"]
        youtube_id = get_youtube_id_from_url(youtube_url)

        if not youtube_id:
            print("    *** WARNING: youtube_id not found for content url",
                  content_url)
            print("    Skipping.")
            return None

        try:
            info = ydl.extract_info(youtube_url, download=False)
            subtitles = info.get("subtitles")
            subtitle_languages = subtitles.keys() if subtitles else []
            print("      ... with subtitles in languages:", subtitle_languages)
        except youtube_dl.DownloadError as e:
            # Some of the videos have been removed from the YouTube channel --
            # skip creating content nodes for them entirely so they don't show up
            # as non-loadable videos in Kolibri.
            print("        NOTE: Skipping video download due to error: ", e)
            return None

        video_node = nodes.VideoNode(
            **base_node_attributes,
            derive_thumbnail=True,
            files=[WatermarkedYouTubeVideoFile(youtube_id=youtube_id)],
        )

        # Add subtitles in whichever languages are available.
        for language in subtitle_languages:
            video_node.add_file(
                files.YouTubeSubtitleFile(youtube_id=youtube_id,
                                          language=language))

        return video_node

    img = doc.select_one(".uncode-single-media-wrapper img")
    if img:
        img_src = img["data-guid"] or img["src"]
        destination = tempfile.mkdtemp()
        download_file(img_src,
                      destination,
                      request_fn=make_request,
                      filename="image.jpg")

        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write("""
                <!doctype html>
                <html>
                <head></head>
                <body>
                    <img src="image.jpg" style="width: 100%; max-width: 1200px;" />
                </body>
                </html>
            """)

        zip_path = create_predictable_zip(destination)

        return nodes.HTML5AppNode(
            **base_node_attributes,
            files=[files.HTMLZipFile(zip_path)],
            thumbnail=img_src,
        )

    return None
Example #26
    def cool(self):
        self.zipname = create_predictable_zip(str(TEMP_FOUNDRY_ZIP))
Example #27
def process_node_from_doc(doc, book_id, title, thumbnail):
    """
    Create a Ricecooker HTML5AppNode instance given the HTML source and metadata.
    """
    if DOWNLOAD_ONE_TO_webroot:
        # Save the book's contents to the folder `webroot` in the chef root dir.
        # Use the script ./ricecooker/utils/kolibripreview.py to preview in Kolibri
        destination = './webroot'
        if os.path.exists(destination):
            shutil.rmtree(destination)
        os.mkdir(destination)
    else:
        # Create a temporary folder to download all the files for a book
        destination = tempfile.mkdtemp()

    # Ensure the thumbnail is in a format Ricecooker can accept, and if not,
    # use the first slide as the thumbnail.
    thumbnail_extensions = ('jpg', 'jpeg', 'png')
    if not thumbnail.lower().endswith(thumbnail_extensions):
        print("Thumbnail src (%s) doesn't end in any of %s."
                " Will use the first slide as the source." % (
            thumbnail, thumbnail_extensions))
        first_slide_src = doc.select_one('#slide-container .slide img')['src']
        thumbnail = make_fully_qualified_url(first_slide_src)
        if not thumbnail.lower().endswith(thumbnail_extensions):
            thumbnail = None

    # Download all the JS/CSS/images/audio/etc. needed to make a standalone app
    doc = download_static_assets(doc, destination)

    # Remove a bunch of HTML that we don't want showing in our standalone app
    doc.select_one('base')['href'] = ''
    remove_node(doc, '#loading')
    remove_node(doc, '#finishedActions')
    remove_node(doc, '.bookmarkbtn')
    remove_node(doc, '.reader-expand')
    remove_node(doc, '#progressBar')
    remove_node(doc, '#androidNotification')
    remove_node(doc, '#exit')
    remove_node(doc, '#ttmenu')

    # Remove unnecessary scripts in the head
    for pat in tag_content_patterns_to_remove_in_head:
        remove_nodes_containing_pattern(doc, pat, parent_tag_name='head')
    for pat in tag_content_patterns_to_remove_in_body:
        remove_nodes_containing_pattern(doc, pat, parent_tag_name='body')
    for pat_start, pat_end in cut_start_end_patterns:
        remove_nodes_between_comments(doc, pat_start, pat_end, parent_tag_name='body')

    # Write out the HTML source
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("Downloaded book %s titled \"%s\" (thumbnail %s) to destination %s" % (
        book_id, title, thumbnail, destination))
    #preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id=book_id,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(copyright_holder='3asafeer.com'),
        thumbnail=thumbnail,
        files=[files.HTMLZipFile(zip_path)],
        language="ar",
    )
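
The remove_node helper called repeatedly above is defined elsewhere in the chef. A plausible minimal implementation with BeautifulSoup (an assumption, not the original code):

def remove_node(doc, selector):
    """Remove the first element matching the CSS selector, if present."""
    node = doc.select_one(selector)
    if node:
        node.decompose()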
Example #28
def make_topic_tree_with_entrypoints(license,
                                     imscp_zip,
                                     imscp_dict,
                                     ims_dir,
                                     temp_dir=None,
                                     parent_id=None,
                                     node_options=None):
    """Return a TopicTree node from a dict of some subset of an IMSCP manifest.

    The actual IMSCP zip is marked as a dependency, and the zip loaded by Kolibri
    only contains an index.html file that redirects to the entrypoint defined in
    the manifest. This minimizes the additional content generated for Kolibri,
    and also allows us to support content where multiple content nodes have entrypoints
    defined by parameters, e.g. index.html#chapter2, index.html#chapter3, etc.

    Ready to be uploaded via Ricecooker to Studio or used in Kolibri.

    Args:
        license - License to apply to content nodes.
        imscp_zip (string) - Path to the IMSCP zip file, marked as a dependency.
        imscp_dict - Dict of IMSCP from extract_from_zip or extract_from_dir.
        ims_dir (string) - Path of directory of IMSCP.
        temp_dir (string, optional) - Full path of temporary directory to
            output HTML zip files to.
        parent_id (string, optional) - Parent ID string to concatenate to source ID.
        node_options (dict, optional) - Options to pass to content renderer in Kolibri.
    """
    if not temp_dir:
        # tempfile.tempdir may be None; gettempdir() always returns a usable path.
        temp_dir = tempfile.gettempdir()

    source_id = imscp_dict['identifier']
    assert source_id, "{} has no identifier, parent id = {}".format(
        os.path.basename(imscp_zip), parent_id)
    if parent_id:
        source_id = '{}-{}'.format(parent_id, source_id)

    if imscp_dict.get('children'):
        topic_node = nodes.TopicNode(source_id=source_id,
                                     title=imscp_dict['title'])
        counter = 1
        for child in imscp_dict['children']:
            # We will get duplicate IDs if we don't have any ID set.
            if not child['identifier']:
                child['identifier'] = 'item{}'.format(counter)
            topic_node.add_child(
                make_topic_tree_with_entrypoints(license,
                                                 imscp_zip,
                                                 child,
                                                 ims_dir,
                                                 temp_dir=temp_dir,
                                                 parent_id=source_id,
                                                 node_options=node_options))
            counter += 1
        return topic_node
    else:
        if imscp_dict['type'] == 'webcontent':
            entrypoint_dir = os.path.join(temp_dir, 'entrypoint')
            if os.path.exists(entrypoint_dir):
                shutil.rmtree(entrypoint_dir)
            os.makedirs(entrypoint_dir)
            index = os.path.join(entrypoint_dir, "index.html")
            entrypoint_url = '/zipcontent/{}/{}'.format(
                os.path.basename(imscp_zip), imscp_dict['href'])
            f = open(index, "w", encoding="utf-8")
            f.write(ENTRYPOINT_TEMPLATE.format(entrypoint_url))
            f.close()

            zip_path = create_predictable_zip(entrypoint_dir)
            html5_node = nodes.HTML5AppNode(
                source_id=source_id,
                title=imscp_dict.get('title'),
                license=license,
                files=[
                    files.HTMLZipFile(zip_path),
                    files.HTMLZipFile(
                        imscp_zip, preset=format_presets.HTML5_DEPENDENCY_ZIP)
                ],
            )
            if node_options is not None:
                extra_data = {'options': node_options}

                html5_node.extra_fields.update(extra_data)

            return html5_node
        else:
            logging.warning('Content type %s not supported yet.' %
                            imscp_dict['type'])
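
ENTRYPOINT_TEMPLATE is defined elsewhere in the chef. A plausible minimal version (an assumption, not the original) is a page that immediately redirects to the /zipcontent/ entrypoint URL:

ENTRYPOINT_TEMPLATE = """<!doctype html>
<html>
<head><meta http-equiv="refresh" content="0; url={}"></head>
<body></body>
</html>
"""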
Example #29
    def modify_zip(self, scorm_zip):
        """
        The SCORM modules we receive in some cases have graphics that reference UI elements that don't exist in
        Kolibri. This function modifies the zip to remove them and returns the modified zip.
        :param scorm_zip: The path to the original zip file.
        :return: Path to the modified zip file, if it exists.
        """
        zip_dir_name = os.path.splitext(os.path.basename(scorm_zip))[0]
        zip_root = os.path.join(self.temp_dir, zip_dir_name)
        output_zip = os.path.join(self.temp_dir, 'out_zips', zip_dir_name)

        os.makedirs(zip_root, exist_ok=True)
        os.makedirs(os.path.dirname(output_zip), exist_ok=True)

        # Use a context manager (and avoid shadowing the zip builtin).
        with zipfile.ZipFile(scorm_zip) as zip_file:
            zip_file.extractall(zip_root)

        zip_changed = False
        telas_end_sprites = os.path.join(zip_root, 'curso', 'telas', 'end', 'sprites.png')
        if os.path.exists(telas_end_sprites):
            LOGGER.debug("Deleting sprites at {}".format(telas_end_sprites))
            os.remove(telas_end_sprites)
            zip_changed = True
        else:
            assert "n1_ted_len_en_u01_v02" not in scorm_zip, os.listdir(zip_root)

        for replace_img in self.replace_images:
            img_glob = glob.glob(os.path.join(zip_root, '**', replace_img), recursive=True)
            for img in img_glob:
                os.remove(img)
                shutil.copy(os.path.join(ROOT_DIR, 'assets', replace_img), img)
                if replace_img not in self.replaced_images:
                    self.replaced_images.append(replace_img)

                zip_changed = True

        # make any HTML replacements
        replaced_imgs = []
        for html_file in glob.glob(os.path.join(zip_root, '**', '*.html'), recursive=True):
            # BeautifulSoup takes the parser name as its second (features) argument.
            with open(html_file, 'rb') as html_in:
                soup = BeautifulSoup(html_in.read(), 'html.parser')

            for img in self.remove_imgs:
                img_tag = soup.find('img', src=re.compile('{}$'.format(img)))
                if img_tag:
                    if not img in self.removed_imgs:
                        self.removed_imgs.append(img)
                    replaced_imgs.append(img)
                    img_tag.extract()
                    with open(html_file, 'wb') as html_out:
                        html_out.write(soup.prettify('utf-8'))
                    zip_changed = True
                    break
                else:
                    assert img not in soup.prettify(), "Problem replacing image {} in {}".format(img, scorm_zip)

        if 'n2_tek_en_lan_u09' in scorm_zip:
            assert zip_changed, "Narrative SCORM module had no changes."
            assert 'kap_cerrar.png' in replaced_imgs, "Replaced images = {}".format(replaced_imgs)
            assert 'kap_cerrar.png' in self.removed_imgs, "Removed images = {}".format(self.removed_imgs)

        if zip_changed:
            temp_zip = create_predictable_zip(zip_root)
            scorm_zip = output_zip + '.zip'
            os.rename(temp_zip, scorm_zip)

        return scorm_zip
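
Hypothetical usage (the node metadata and surrounding variables are made up for illustration): each downloaded SCORM package is passed through modify_zip before being wrapped in an HTMLZipFile.

        modified_zip = self.modify_zip(scorm_zip)  # returns the original path if nothing changed
        html5_node = nodes.HTML5AppNode(
            source_id=os.path.splitext(os.path.basename(modified_zip))[0],
            title=module_title,    # assumed to be in scope
            license=self.license,  # assumed to be in scope
            files=[files.HTMLZipFile(modified_zip)],
        )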
Example #30
    def download_sim(self, topic, sim, keywords, language):
        """
        Download, zip, and add a node for a sim, as well as any associated video.
        """

        localized_sim = sim["localizedSimulations"][0]

        print("\tProcessing sim:", localized_sim["title"])

        dst = tempfile.mkdtemp()
        download_file(
            localized_sim["downloadUrl"],
            dst,
            filename="index.html",
            request_fn=sess.get,
            middleware_callbacks=[process_sim_html],
        )

        zippath = create_predictable_zip(dst)

        authors = re.sub(" \(.*?\)", "", sim["credits"]["designTeam"])
        authors = re.sub("<br\/?>", ", ", authors)

        title = localized_sim["title"]
        if language == "ar":
            if title in ARABIC_NAME_CATEGORY:
                title = ARABIC_NAME_CATEGORY[title]
            if title in SIM_TYPO:
                title = SIM_TYPO[title]

        # create a node for the sim
        simnode = HTML5AppNode(
            source_id="sim-%d" % localized_sim["id"],
            files=[HTMLZipFile(zippath)],
            title=title,
            description=sim["description"][language][:200],
            license=CC_BYLicense(
                "PhET Interactive Simulations, University of Colorado Boulder"
            ),
            # author=authors,
            # tags=[keywords[topic] for topic in sim["topicIds"]],
            thumbnail=sim["media"]["thumbnailUrl"],
            language=getlang(language),
        )

        # if there's a video, extract it and put it in the topic right before the sim
        videos = sim["media"]["vimeoFiles"]
        if videos:
            video_url = [v for v in videos
                         if v.get("height") == 540][0]["link"]

            videonode = VideoNode(
                source_id="video-%d" % localized_sim["id"],
                files=[VideoFile(video_url)],
                title="Video: %s" % localized_sim["title"],
                license=CC_BYLicense(
                    "PhET Interactive Simulations, University of Colorado Boulder"
                ),
                thumbnail=sim["media"]["thumbnailUrl"],
            )

            topic.add_child(videonode)

        # add the sim node into the topic
        topic.add_child(simnode)
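
The process_sim_html middleware passed to download_file above is defined elsewhere in the chef. A rough sketch of the kind of transformation such a callback might perform (both the signature and the rewrite are assumptions):

def process_sim_html(content, **kwargs):
    """Rewrite the sim HTML so it loads standalone from the zip root."""
    # Strip the absolute host so asset references resolve relative to index.html.
    return content.replace("https://phet.colorado.edu/sims/html/", "")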