def scrape_resource(url, topic):
    """Scrape a single Ceibal resource page and add it to `topic`.

    Args:
        url (str): url of the resource page to scrape
        topic (TopicNode): topic to add the scraped HTML5 app node to
    """
    resource = BeautifulSoup(downloader.read(url), 'html5lib')
    LOGGER.info(' {}'.format(resource.find('h2').text))

    filepath = download_resource(resource.find('div', {'class': 'decargas'}).find('a')['href'])
    license = None
    author = ''
    for data_section in resource.find('div', {'class': 'datos_generales'}).find_all('h4'):
        if 'Licencia' in data_section.text:
            try:
                license = LICENSE_MAP[data_section.find_next_sibling('p').text](copyright_holder="Ceibal")
            except KeyError as e:
                LOGGER.error(str(e))
                # BUGFIX: the fallback previously assigned the bare class
                # (licenses.CC_BYLicense) instead of an instance, unlike the
                # LICENSE_MAP path above; instantiate it with the same holder.
                license = licenses.CC_BYLicense(copyright_holder="Ceibal")
        elif 'Autor' in data_section.text:
            author = data_section.find_next_sibling('p').text

    if filepath:
        thumbnail = resource.find('div', {'class': 'img-recurso'}).find('img')['src']
        if thumbnail.endswith('.gif'):
            # .gif thumbnails aren't accepted; save the image locally under a
            # .png filename instead and use the local path.
            thumbnail = os.path.sep.join([DOWNLOAD_DIRECTORY, thumbnail.split('/')[-1].replace('.gif', '.png')])
            with open(thumbnail, 'wb') as fobj:
                fobj.write(downloader.read(resource.find('div', {'class': 'img-recurso'}).find('img')['src']))

        topic.add_child(nodes.HTML5AppNode(
            title=resource.find('h2').text,
            source_id=url,
            license=license,
            author=author,
            description=resource.find('form').find_all('p')[1].text,
            thumbnail=thumbnail,
            tags=[tag.text[:30] for tag in resource.find_all('a', {'class': 'tags'})],
            files=[files.HTMLZipFile(path=filepath)],
        ))
def scrape_snack_subject(slug, topic):
    """ Scrape snack subject page
        Args:
            slug (str): url slug to scrape from (e.g. /subject/arts)
            topic (TopicNode): topic to add html nodes to
    """
    page = BeautifulSoup(read(slug), 'html5lib')

    for activity in page.find_all('div', {'class': 'activity'}):
        activity_title = activity.find('h5').text.strip()
        LOGGER.info(" {}".format(activity_title))

        activity_href = activity.find('a')['href']

        # Scrape snack pages into zips
        write_to_path, tags = scrape_snack_page(activity_href)
        if not write_to_path:
            continue

        # Create html node
        description = activity.find('div', {'class': 'pod-description'})
        description_text = description.text.strip() if description else ""
        topic.add_child(
            nodes.HTML5AppNode(
                source_id=activity_href,
                title=activity_title.replace("’", "'"),
                description=description_text,
                license=LICENSE,
                copyright_holder=COPYRIGHT_HOLDER,
                files=[files.HTMLZipFile(path=write_to_path)],
                thumbnail=get_thumbnail_url(activity.find('img')['src']),
                tags=tags,
            ))

    # Scrape next page (if any)
    next_page_url = get_next_page_url(page)
    if next_page_url:
        scrape_snack_subject(next_page_url, topic)
def download_wikipedia_page(url, thumbnail, title):
    """ Create zip file to use for html pages """
    # Temp directory to house our downloaded files
    dest_dir = tempfile.mkdtemp()

    # Download the main wikipedia page, apply a middleware processor,
    # and call it index.html
    download_file(
        url,
        dest_dir,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
    )

    # Turn the temp folder into a zip file
    zip_path = create_predictable_zip(dest_dir)

    # Create and return an HTML5 app node wrapping the zip
    return nodes.HTML5AppNode(
        files=[files.HTMLZipFile(zip_path)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=CHANNEL_LICENSE,
    )
def download_puzzle(puzzle_url, title, description, thumbnail,
                    le_language_code, blockly_language_code):
    """Download a single puzzle and return an HTML5 app node.

    Args:
        puzzle_url (str): puzzle path relative to blockly-games.appspot.com;
            may carry a querystring, which is stripped for asset lookup.
        title (str): node title (truncated to metadata limits).
        description (str): node description.
        thumbnail: thumbnail for the node.
        le_language_code (str): language code assigned to the node.
        blockly_language_code (str): language code injected into the page
            for the Blockly Games runtime.
    """
    # Render via a real browser so the JS app finishes building the DOM
    # before we capture the page source (delay=1000 ms).
    with WebDriver("https://blockly-games.appspot.com/%s" % puzzle_url,
                   delay=1000) as driver:
        doc = BeautifulSoup(driver.page_source, "html.parser")

        # Create a temporary folder to download all the files for a puzzle.
        destination = tempfile.mkdtemp()

        # Download all the JS/CSS/images/audio/etc we can get from scraping the
        # page source.
        doc = download_static_assets(doc, destination,
                'https://blockly-games.appspot.com', request_fn=make_request,
                url_blacklist=['analytics.js'])

        # Download other files not picked up by the above generic assets
        # fetching, e.g. from GitHub.
        puzzle_name = puzzle_url.split('?')[0]
        download_additional_assets(destination, puzzle_name)

        # Make some modifications to the HTML source -- hide some elements.
        remove_node(doc, '#languageMenu')
        remove_node(doc, '#title')

        # Copy over some of our own JS/CSS files and then add links to them in
        # the page source.
        copy_tree("static", os.path.join(destination, "static"))

        chef_body_script = doc.new_tag("script", src="static/chef_end_of_body.js")
        doc.select_one('body').append(chef_body_script)

        # Tell the injected runtime which language to load.
        chef_head_script = doc.new_tag("script")
        chef_head_script.string = 'window["BlocklyGamesLang"] = "%s";' % blockly_language_code
        doc.select_one('head').insert(0, chef_head_script)

        # Write out the HTML source.
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write(str(doc))

        print(" Downloaded puzzle %s titled \"%s\" (thumbnail %s) to destination %s" % (
            puzzle_url, title, thumbnail, destination))
        # preview_in_browser(destination)

        zip_path = create_predictable_zip(destination)

        return nodes.HTML5AppNode(
            source_id=puzzle_url,
            title=truncate_metadata(title),
            description=description,
            license=licenses.PublicDomainLicense(copyright_holder='Google'),
            thumbnail=thumbnail,
            files=[files.HTMLZipFile(zip_path)],
            language=le_language_code,
        )
def download_writing_topic_category(category_doc, title, level_id): destination = tempfile.mkdtemp() # Download a font font_url = make_fully_qualified_url( '//fonts.googleapis.com/css?family=Roboto:400,300,300italic,400italic,700,700italic' ) download_file(font_url, destination, request_fn=make_request, filename='roboto.css') # Write out the HTML source, based on CSS formatting from # https://k12.thoughtfullearning.com/resources/writingtopics topics = (("<li>%s</li>" % topic.text) for topic in category_doc.select('.views-row')) html_source = """ <!DOCTYPE html> <head> <link href='roboto.css' rel='stylesheet' type='text/css'> <style> ul { margin: 0 0 0 40px; padding: 0; } li { font-family: "Roboto", sans-serif; font-weight: 300; font-size: 19.2px; line-height: 24.96px; color: #202020; margin-top: 10px; } </style> </head> <body> <ul>%s</ul> </body> """ % ''.join(topics) with open(os.path.join(destination, "index.html"), "w") as f: f.write(html_source) print(" ... downloaded to %s" % destination) #preview_in_browser(destination) zip_path = create_predictable_zip(destination) return nodes.HTML5AppNode( source_id="%s|%s" % (level_id, title), title=truncate_metadata(title), license=licenses.CC_BY_NC_SALicense( copyright_holder=truncate_metadata('Thoughtful Learning')), files=[files.HTMLZipFile(zip_path)], language="en", thumbnail=writing_topic_thumbnail, )
def construct_channel(self, *args, **kwargs): channel = self.get_channel( *args, **kwargs) # Create ChannelNode from data in self.channel_info lang_names = list(self.data.keys()) lang_names.sort() for lang_name in lang_names: lang_data = self.data[lang_name] LOGGER.info("Creating app for language: {}".format(lang_name)) lang = languages.getlang_by_native_name(lang_name) zip_dir = self.client.create_zip_dir_for_page(lang_data['url']) soup = self.client.get_page_soup(lang_data['url']) # Remove the translation list if found translations = soup.find('div', {'id': 'translations'}) if translations: translations.extract() # Grab the localized title title = soup.find('span', {'id': 'share_title'}).text # Save the modified index.html page thumbnail = None for resource in lang_data['resources']: if 'dp3t.png' in resource: thumbnail = os.path.join(zip_dir, resource) break with open(os.path.join(zip_dir, 'index.html'), 'wb') as f: f.write(soup.prettify(encoding='utf-8')) # create_predictable_zip ensures that the ZIP file does not change each time it's created. This # ensures that the zip doesn't get re-uploaded just because zip metadata changed. zip_file = zip.create_predictable_zip(zip_dir) zip_name = lang.primary_code if lang else lang_name zip_filename = os.path.join(self.ZIP_DIR, "{}.zip".format(zip_name)) os.makedirs(os.path.dirname(zip_filename), exist_ok=True) os.rename(zip_file, zip_filename) topic = nodes.TopicNode(source_id=lang_name, title=lang_name) zip_node = nodes.HTML5AppNode( source_id="covid19-sim-{}".format(lang_name), title=title, files=[files.HTMLZipFile(zip_filename)], license=licenses.PublicDomainLicense( "Marcel Salathé & Nicky Case"), language=lang, thumbnail=thumbnail) topic.add_child(zip_node) channel.add_child(topic) return channel
def to_contentnode(self, title, directory=None, *args, **kwargs):
    """Convert this resource to a content node based on its kind.

    Returns an HTML5AppNode or VideoNode, or None for any other kind.
    """
    # Materialize the resource on disk first.
    filepath = self.to_file(directory=directory)

    if self.kind == content_kinds.HTML5:
        zip_file = files.HTMLZipFile(filepath)
        return nodes.HTML5AppNode(
            source_id=self.url,
            title=title,
            files=[zip_file],
            **kwargs
        )

    if self.kind == content_kinds.VIDEO:
        video_file = files.VideoFile(filepath)
        return nodes.VideoNode(
            source_id=self.url,
            title=title,
            files=[video_file],
            **kwargs
        )
def create_topic_nodes_recursive(self, topic_info):
    """
    Create nodes for all the content items in the tree. Currently supports
    HTML5 app node and topic node creation.

    :param topic_info: Dictionary with information about the current topic to
        use for generating nodes.
    :return: A TopicNode of the node topic_info along with all child topics
        and nodes, or None when the subtree holds no content at all.
    """
    topic_node = nodes.TopicNode(source_id=str(topic_info['id']),
                                 title=topic_info['text'])
    has_content = False
    if 'nodes' in topic_info:
        has_content = True
        topic_nodes = topic_info['nodes']
        for anode in topic_nodes:
            node_files = [files.HTMLZipFile(anode['html5_zip'])]
            # Some items additionally need the shared dependency zip.
            if 'needs_dep_zip' in anode and anode['needs_dep_zip']:
                print("Needs dep zip: {}".format(anode))
                node_files.append(self.dep_zip_file)
            html_node = nodes.HTML5AppNode(files=node_files,
                                           title=anode['title'],
                                           source_id=anode['dir'],
                                           license=licenses.CC_BY_NC,
                                           copyright_holder="ekShiksha")
            if 'description' in anode:
                html_node.description = anode['description']

            # One possible way to store metadata about each content node on Studio.
            # extra_fields = {'metadata' : {}}
            # metadata = extra_fields['metadata']
            # metadata['grades'] = [{'curriculum': 'CBSE', 'grades': [int(anode['standard'])] }]
            # metadata['subject'] = topic_info['text']
            # TODO: Add the topic tree as 'categories'

            topic_node.add_child(html_node)

    if 'subtopics' in topic_info:
        for subtopic in topic_info['subtopics']:
            child = self.create_topic_nodes_recursive(subtopic)
            # Recursive call returns None for empty subtrees; skip those.
            if child:
                has_content = True
                topic_node.add_child(child)

    # This shouldn't happen, so output a warning if it does.
    if not has_content:
        print("Node {} has no content".format(topic_info))
        return None

    return topic_node
def create_html5_app_node(license, content_dict, ims_dir, scraper_class=None,
        temp_dir=None, needs_scorm_support=False):
    """Build an HTML5 app node for one IMSCP content item.

    Args:
        license: license to apply to the node.
        content_dict (dict): item info with 'index_file', 'files',
            'identifier', 'title' and optionally 'scormtype'.
        ims_dir (str): directory the IMSCP package was extracted into.
        scraper_class (optional): webmixer scraper class; when given, the
            scraper builds the HTML zip instead of a plain file copy.
        temp_dir (str, optional): where to place the scraper's output zip.
        needs_scorm_support (bool): inject SCORM shims for 'sco' items.
    """
    if scraper_class:
        index_path = os.path.join(ims_dir, content_dict['index_file'])
        # Entrypoints may carry querystrings/fragments; strip them to get a
        # real file path on disk.
        if '?' in index_path:
            index_path = index_path.split('?')[0]
        if '#' in index_path:
            index_path = index_path.split('#')[0]
        if content_dict['scormtype'] == 'sco' and needs_scorm_support:
            add_scorm_support(index_path, ims_dir)

        index_uri = pathlib.Path(os.path.abspath(index_path)).as_uri()
        # Hash the URI so repeated runs produce a stable zip filename.
        zip_name = '%s.zip' % hashlib.md5(
            index_uri.encode('utf-8')).hexdigest()
        temp_dir = temp_dir if temp_dir else tempfile.gettempdir()
        zip_path = os.path.join(temp_dir, zip_name)
        scraper = scraper_class(index_uri)
        scraper.download_file(zip_path)
        logging.info('Webmixer scraper outputted HTML app to %s' % zip_path)
    else:
        with tempfile.TemporaryDirectory() as destination:
            index_src_path = os.path.join(ims_dir, content_dict['index_file'])
            index_dest_path = os.path.join(destination, 'index.html')
            shutil.copyfile(index_src_path, index_dest_path)
            for file_path in content_dict['files']:
                shutil.copy(os.path.join(ims_dir, file_path), destination)
            if content_dict.get('scormtype') == 'sco' and needs_scorm_support:
                add_scorm_support(index_dest_path, destination)
            #preview_in_browser(destination)
            # Must run inside the with-block, before the temp dir is removed.
            zip_path = create_predictable_zip(destination)

    return nodes.HTML5AppNode(
        source_id=content_dict['identifier'],
        title=content_dict.get('title'),
        license=license,
        files=[files.HTMLZipFile(zip_path)],
    )
def create_html5_app_node(license, content_dict):
    """Package an index file plus its supporting files into an HTML5 app node."""
    with tempfile.TemporaryDirectory() as workdir:
        # Entrypoint must be named index.html inside the zip.
        shutil.copyfile(content_dict['index_file'],
                        os.path.join(workdir, 'index.html'))
        for supporting_file in content_dict['files']:
            shutil.copy(supporting_file, workdir)
        #preview_in_browser(workdir)
        zip_path = create_predictable_zip(workdir)

    return nodes.HTML5AppNode(
        source_id=content_dict['identifier'],
        title=content_dict.get('title'),
        license=license,
        files=[files.HTMLZipFile(zip_path)],
    )
def scrape_book(url, license):
    """ Scrape book and return html node
        e.g. https://saylordotorg.github.io/text_financial-accounting/

        Returns None for books whose content lives on another site.
    """
    page = BeautifulSoup(read_source(url), 'html.parser')

    # Skip books that link to other websites
    if not page.find('div', {'id': 'book-content'}):
        return

    # Get fields for new html node
    title = page.find('h1').text.replace(u'\xa0', u' ').replace('\n', '')
    source_id = generate_id(title)
    write_to_path = "{}{}{}.zip".format(DOWNLOAD_DIRECTORY, os.path.sep, source_id)
    LOGGER.info(" " + title)

    # Write to html zip
    # if not os.path.isfile(write_to_path):
    with html.HTMLWriter(write_to_path) as zipper:
        # Parse table of contents (fresh copy, mutated by parse_page_links)
        contents = BeautifulSoup(read_source(url), 'html.parser')
        parse_page_links(url, contents, zipper)

        # Parse all links in the table of contents
        for link in contents.find_all('a'):
            if link.get('href'):
                # Get page content and write to zip
                chapter_contents = BeautifulSoup(
                    read_source(url, endpoint=link['href']), 'html.parser')
                parse_page_links(url, chapter_contents, zipper, link['href'])
                zipper.write_contents(link['href'], chapter_contents.prettify())

        # Write main index.html file and all shared files
        zipper.write_index_contents(contents.prettify())
        write_shared_library_to_zip(zipper)

    return nodes.HTML5AppNode(source_id=source_id,
                              title=title,
                              license=license,
                              copyright_holder=COPYRIGHT_HOLDER,
                              files=[files.HTMLZipFile(path=write_to_path)])
def download_content_node(url, title):
    """Download a page into a standalone zip and return an HTML5 app node."""
    doc = get_parsed_html_from_url(url)

    dest_dir = tempfile.mkdtemp()
    doc = download_static_assets(doc, dest_dir, 'http://migranthealth.eu/',
            request_fn=make_request, url_blacklist=url_blacklist,
            derive_filename=derive_filename)

    # Strip site chrome: header, sidebars, navigation and footer elements.
    selectors_to_drop = (
        'header',
        '#page-top-header',
        '#block-region-side-pre',
        '#region-main .row-fluid .span4.heading-rts',
        '.readmoreLinks',
        '.courseSectionNext',
        'img[alt="next"]',
        '.modified',
        '.footer-rts',
        '#page-footer',
        '.back-to-top',
        '.skiplinks',
        '.linkicon',
        '.generalbox table tr:nth-of-type(2)',
    )
    for css_selector in selectors_to_drop:
        for matched in doc.select(css_selector):
            matched.decompose()

    # Write out the HTML source.
    with open(os.path.join(dest_dir, "index.html"), "w") as f:
        f.write(str(doc))

    print(" ... downloaded to %s" % dest_dir)
    #preview_in_browser(dest_dir)

    zip_path = create_predictable_zip(dest_dir)
    return nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=MEET_LICENSE,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )
def make_topic_tree_with_entrypoints(license, imscp_zip, imscp_dict, ims_dir,
        temp_dir=None, parent_id=None, node_options=None):
    """Return a TopicTree node from a dict of some subset of an IMSCP manifest.

    The actual IMSCP zip is marked as a dependency, and the zip loaded by
    Kolibri only contains an index.html file that redirects to the entrypoint
    defined in the manifest. This minimizes the additional content generated
    for Kolibri, and also allows us to support content where multiple content
    nodes have entrypoints defined by parameters, e.g. index.html#chapter2,
    index.html#chapter3, etc.

    Ready to be uploaded via Ricecooker to Studio or used in Kolibri.

    Args:
        license - License to apply to content nodes.
        imscp_dict - Dict of IMSCP from extract_from_zip or extract_from_dir.
        ims_dir (string) - Path of directory of IMSCP
        temp_dir (string, optional) - Full path of temporary directory to
            output HTML zip files to.
        parent_id (string, optional) - Parent ID string to concatenate to
            source ID.
        node_options (dict, optional) - Options to pass to content renderer
            in Kolibri.
    """
    if not temp_dir:
        # BUGFIX: tempfile.tempdir is None until tempfile lazily resolves it,
        # so os.path.join(None, ...) could crash; gettempdir() always returns
        # a usable directory path.
        temp_dir = tempfile.gettempdir()

    source_id = imscp_dict['identifier']
    assert source_id, "{} has no identifier, parent id = {}".format(
        os.path.basename(imscp_zip), parent_id)
    if parent_id:
        source_id = '{}-{}'.format(parent_id, source_id)

    if imscp_dict.get('children'):
        topic_node = nodes.TopicNode(source_id=source_id,
                                     title=imscp_dict['title'])
        counter = 1
        for child in imscp_dict['children']:
            # We will get duplicate IDs if we don't have any ID set.
            if not child['identifier']:
                child['identifier'] = 'item{}'.format(counter)
            child_node = make_topic_tree_with_entrypoints(
                license, imscp_zip, child, ims_dir, temp_dir=temp_dir,
                parent_id=source_id, node_options=node_options)
            # BUGFIX: unsupported child types return None; skip them instead
            # of adding a None child to the topic node.
            if child_node is not None:
                topic_node.add_child(child_node)
            counter += 1
        return topic_node
    else:
        if imscp_dict['type'] == 'webcontent':
            entrypoint_dir = os.path.join(temp_dir, 'entrypoint')
            if os.path.exists(entrypoint_dir):
                shutil.rmtree(entrypoint_dir)
            os.makedirs(entrypoint_dir)

            # Write a tiny redirect page pointing into the dependency zip.
            index = os.path.join(entrypoint_dir, "index.html")
            entrypoint_url = '/zipcontent/{}/{}'.format(
                os.path.basename(imscp_zip), imscp_dict['href'])
            with open(index, "w", encoding="utf-8") as f:
                f.write(ENTRYPOINT_TEMPLATE.format(entrypoint_url))

            zip_path = create_predictable_zip(entrypoint_dir)
            html5_node = nodes.HTML5AppNode(
                source_id=source_id,
                title=imscp_dict.get('title'),
                license=license,
                files=[
                    files.HTMLZipFile(zip_path),
                    files.HTMLZipFile(
                        imscp_zip,
                        preset=format_presets.HTML5_DEPENDENCY_ZIP)
                ],
            )
            if node_options is not None:
                extra_data = {'options': node_options}
                html5_node.extra_fields.update(extra_data)
            return html5_node
        else:
            logging.warning('Content type %s not supported yet.'
                            % imscp_dict['type'])
def construct_channel(self, **kwargs):
    """Build the channel: one topic node per grade, each holding the teacher
    course guide and the first student lesson as HTML5 app nodes.

    Returns:
        The populated ChannelNode.
    """
    # create channel
    channel = self.get_channel(**kwargs)

    # create a topic and add it to channel
    for grade in GRADES:
        grade_node = nodes.TopicNode(
            source_id=str(grade),
            title="Grade {grade}".format(grade=grade),
            description="",
        )
        channel.add_child(grade_node)

        # Teacher course guide page, mirrored locally and zipped.
        filename = localise.make_local(
            BASE_URL.format(grade=grade, target='teachers') +
            "/teacher_course_guide.html")
        print(filename)
        file = HTMLZipFile(filename)
        course_guide_node = nodes.HTML5AppNode(
            source_id="{grade}-teachers-teacher_course_guide".format(
                grade=grade),
            title="Grade {grade} Teacher Course Guide".format(grade=grade),
            license=licenses.CC_BY_NC_SA,
            copyright_holder="Open Up Resources",
            #author="Open Up Resources",
            #description="",
            #thumbnail="",
            #extra_fields={},
            #domain_ns="",
            files=[file],
        )
        grade_node.add_child(course_guide_node)

        # First student lesson page (unit 1, lesson 1).
        filename = localise.make_local(
            BASE_URL.format(grade=grade, target="students") + "/1/1.html")
        print(filename)
        file = HTMLZipFile(filename)
        course_guide_node = nodes.HTML5AppNode(
            source_id="{grade}-students-1-1".format(grade=grade),
            title="Grade {grade} 1-1".format(grade=grade),
            license=licenses.CC_BY_NC_SA,
            copyright_holder="Open Up Resources",
            #author="Open Up Resources",
            #description="",
            #thumbnail="",
            #extra_fields={},
            #domain_ns="",
            files=[file],
        )
        grade_node.add_child(course_guide_node)

    # NOTE(review): free-standing site-structure notes kept from the original
    # author; this string is a no-op statement.
    """6/teachers/1.html -- has description of this topic; has drop down list of lessons within it
    6/teachers/1/1.html -- Is a lesson plan.
    6/teachers/1/assessments/unit_assessments.html -- broken
    6/teachers/1/practice_problems.html -- practice problems for all lessons w/solutons
    6/teachers/1/downloads.html -- 7x links to pdfs/zips of pdfs
    6/teachers/1/family_materials.html -- same as family? (YES) topicwide
    6/teachers/teacher_course_guide.html -- single page per year
    6/families/1.html -- same as teachers / family materials
    6/students/1/1.html -- is student resources.
    6/students/1/practice_problems.html - nothing complex
    6/students/1/glossary.html - nothing complex
    6/students/1/my_reflections.html - nothing complex
    """
    return channel
def scrape_page(exp_id, language, subject_node):
    """Scrape one Sciensation experiment page, package it as a zip, and attach
    it to subject_node as an HTML5 app node.

    Args:
        exp_id: experiment identifier used to build the page url.
        language (str): language code for the localized page and the node.
        subject_node (TopicNode): node the new HTML5 app node is added to.

    Returns:
        subject_node, with the new child appended.
    """
    # format to appropriate url depending on language
    my_downloader = downloader.ArchiveDownloader(EXPERIMENTS_FOLDER)
    url = format_url(exp_id, language)
    # page = downloader.archive_page(url, EXPERIMENTS_FOLDER)
    page = my_downloader.get_page(url, refresh=True)
    my_zip_dir = my_downloader.create_zip_dir_for_page(url)
    index_file = os.path.join(my_zip_dir, 'index.html')
    # entry = page['index_path']
    zip_path_entry = os.path.relpath(
        index_file, os.path.join('chefdata', 'experiments'))

    soup = BeautifulSoup(open(index_file, encoding='utf-8'), 'html.parser')

    # get title
    visible_SRAtitle = soup.find('h1', {'class': 'SRAtitle'})
    title = visible_SRAtitle.get_text(strip=True)

    # get tags (the last SRAtd div holds the tag links)
    visible_SRAtd = soup.findAll('div', {'class': 'SRAtd'})
    visible_tags = visible_SRAtd[-1]
    tags_arr = []
    for a_tags in visible_tags.findAll('a'):
        tag = a_tags.get_text(strip=True)
        # remove special characters
        tag = re.sub(r"[^a-zA-Z0-9]+", ' ', tag)
        # removing ending whitespace
        tag = tag.rstrip()
        tags_arr.append(tag)

    # remove navbar
    navbar = soup.find('nav')
    navbar.decompose()

    # remove footer
    footer = soup.find('footer')
    footer.decompose()

    # remove all hrefs so the offline page has no dead links
    for a_tag in soup.findAll('a'):
        del a_tag['href']
        # move all children of a tag to parent
        a_tag.replaceWithChildren()

    # write updated soup back over the archived html file
    soup_str = str(soup)
    # html_file = open(entry, 'w', encoding = 'utf-8')
    html_file = open(index_file, 'w', encoding='utf-8')
    html_file.write(soup_str)
    html_file.close()

    # zippath = zip.create_predictable_zip(EXPERIMENTS_FOLDER, zip_path_entry)
    zippath = zip.create_predictable_zip(my_zip_dir)
    # copy zippath to temp folder here if necessary
    shutil.copy(zippath, TEMP_FOLDER)

    html5_node = nodes.HTML5AppNode(
        source_id='{0}_{1}'.format(language, url),
        files=[files.HTMLZipFile(zippath)],
        title=title,
        description='',
        license=licenses.CC_BYLicense('Sciensation'),
        language=language,
        thumbnail=None,
        author='Sciensation',
        tags=tags_arr)
    subject_node.add_child(html5_node)
    return subject_node
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively parse nodes in the list `sourcetree` and add them as children
    to the `parent_node`. Usually called with `parent_node` being a
    `ChannelNode`.

    Raises:
        NotImplementedError: if a node's 'kind' is not one of the expected
            node types.
    """
    EXPECTED_NODE_TYPES = [
        TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE, DOCUMENT_NODE,
        HTML5_NODE
    ]

    for source_node in sourcetree:
        kind = source_node['kind']
        if kind not in EXPECTED_NODE_TYPES:
            LOGGER.critical('Unexpected node type found: ' + kind)
            raise NotImplementedError(
                'Unexpected node type found in json data.')

        if kind == TOPIC_NODE:
            # Topics have no license/files; recurse into their children.
            child_node = nodes.TopicNode(
                source_id=source_node.get("source_id", None),
                title=source_node["title"],
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
            )
            parent_node.add_child(child_node)
            source_tree_children = source_node.get("children", [])
            build_tree_from_json(child_node, source_tree_children)

        elif kind == VIDEO_NODE:
            child_node = nodes.VideoNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                derive_thumbnail=source_node.get(
                    'derive_thumbnail', True),  # video-specific option
                thumbnail=source_node.get('thumbnail'),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        elif kind == AUDIO_NODE:
            child_node = nodes.AudioNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get('thumbnail'),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        elif kind == EXERCISE_NODE:
            child_node = nodes.ExerciseNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
                questions=[],
            )
            add_questions(child_node, source_node.get("questions") or [])
            parent_node.add_child(child_node)

        elif kind == DOCUMENT_NODE:
            child_node = nodes.DocumentNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        elif kind == HTML5_NODE:
            child_node = nodes.HTML5AppNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        else:
            # Defensive: unreachable because kinds were validated above.
            LOGGER.critical("Encountered an unknown kind: " + str(source_node))
            continue

    return parent_node
def process_node_from_doc(doc, book_id, title, thumbnail):
    """
    Create a Ricecooker HTML5AppNode instance given the HTML source and metadata.

    Args:
        doc: BeautifulSoup document of the book reader page.
        book_id: identifier used as the node's source_id.
        title (str): book title (truncated to metadata limits).
        thumbnail (str): thumbnail src; replaced by the first slide (or None)
            when not jpg/jpeg/png.
    """
    if DOWNLOAD_ONE_TO_webroot:
        # Save the book's contents to the folder `webroot` in the chef root dir.
        # Use the script ./ricecooker/utils/kolibripreview.py to preview in K
        destination = './webroot'
        if os.path.exists(destination):
            shutil.rmtree(destination)
        os.mkdir(destination)
    else:
        # Create a temporary folder to download all the files for a book
        destination = tempfile.mkdtemp()

    # Ensure the thumbnail is in a format Ricecooker can accept, and if not,
    # use the first slide as the thumbnail.
    thumbnail_extensions = ('jpg', 'jpeg', 'png')
    if not thumbnail.lower().endswith(thumbnail_extensions):
        print("Thumbnail src (%s) doesn't end in any of %s."
              " Will use the first slide as the source." % (
                  thumbnail, thumbnail_extensions))
        first_slide_src = doc.select_one('#slide-container .slide img')['src']
        thumbnail = make_fully_qualified_url(first_slide_src)
        # Give up on a thumbnail entirely if the slide isn't usable either.
        if not thumbnail.lower().endswith(thumbnail_extensions):
            thumbnail = None

    # Download all the JS/CSS/images/audio/et needed to make a standalone app
    doc = download_static_assets(doc, destination)

    # Remove a bunch of HTML that we don't want showing in our standalone app
    doc.select_one('base')['href'] = ''
    remove_node(doc, '#loading')
    remove_node(doc, '#finishedActions')
    remove_node(doc, '.bookmarkbtn')
    remove_node(doc, '.reader-expand')
    remove_node(doc, '#progressBar')
    remove_node(doc, '#androidNotification')
    remove_node(doc, '#exit')
    remove_node(doc, '#ttmenu')

    # Remove unnecessary scripts in the head
    for pat in tag_content_patterns_to_remove_in_head:
        remove_nodes_containing_pattern(doc, pat, parent_tag_name='head')
    for pat in tag_content_patterns_to_remove_in_body:
        remove_nodes_containing_pattern(doc, pat, parent_tag_name='body')
    for pat_start, pat_end in cut_start_end_patterns:
        remove_nodes_between_comments(doc, pat_start, pat_end,
                                      parent_tag_name='body')

    # Write out the HTML source
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print("Downloaded book %s titled \"%s\" (thumbnail %s) to destination %s" % (
        book_id, title, thumbnail, destination))
    #preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)

    return nodes.HTML5AppNode(
        source_id=book_id,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(copyright_holder='3asafeer.com'),
        thumbnail=thumbnail,
        files=[files.HTMLZipFile(zip_path)],
        language="ar",
    )
def scrape_content(title, content_url):
    """
    title: Boys' clothing
    content_url: http://www.touchableearth.org/china-culture-boys-clothing/

    Returns a VideoNode for video pages, an HTML5AppNode for image pages, or
    None when the page is missing or the media can't be used.
    """
    print(" Scraping content node: %s (%s)" % (title, content_url))

    doc = get_parsed_html_from_url(content_url)
    if not doc:  # 404
        return None

    description = create_description(doc)
    source_id = doc.select_one(".current_post.active .post_id")["value"]

    base_node_attributes = {
        "source_id": source_id,
        "title": title,
        "license": TE_LICENSE,
        "description": description,
    }

    youtube_iframe = doc.select_one(".video-container iframe")
    if youtube_iframe:
        youtube_url = youtube_iframe["src"]
        youtube_id = get_youtube_id_from_url(youtube_url)
        if not youtube_id:
            print(" *** WARNING: youtube_id not found for content url", content_url)
            print(" Skipping.")
            return None

        try:
            info = ydl.extract_info(youtube_url, download=False)
            subtitles = info.get("subtitles")
            subtitle_languages = subtitles.keys() if subtitles else []
            print(" ... with subtitles in languages:", subtitle_languages)
        except youtube_dl.DownloadError as e:
            # Some of the videos have been removed from the YouTube channel --
            # skip creating content nodes for them entirely so they don't show up
            # as non-loadable videos in Kolibri.
            print(" NOTE: Skipping video download due to error: ", e)
            return None

        video_node = nodes.VideoNode(
            **base_node_attributes,
            derive_thumbnail=True,
            files=[WatermarkedYouTubeVideoFile(youtube_id=youtube_id)],
        )

        # Add subtitles in whichever languages are available.
        for language in subtitle_languages:
            video_node.add_file(
                files.YouTubeSubtitleFile(youtube_id=youtube_id,
                                          language=language))

        return video_node

    img = doc.select_one(".uncode-single-media-wrapper img")
    if img:
        # BUGFIX: subscript access raises KeyError when the data-guid
        # attribute is absent; use .get() so we fall back to src instead.
        img_src = img.get("data-guid") or img["src"]

        destination = tempfile.mkdtemp()
        download_file(img_src, destination, request_fn=make_request,
                      filename="image.jpg")

        # Wrap the image in a minimal standalone page for the HTML5 app zip.
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write("""
                <!doctype html>
                <html>
                <head></head>
                <body>
                    <img src="image.jpg" style="width: 100%; max-width: 1200px;" />
                </body>
                </html>
            """)

        zip_path = create_predictable_zip(destination)
        return nodes.HTML5AppNode(
            **base_node_attributes,
            files=[files.HTMLZipFile(zip_path)],
            thumbnail=img_src,
        )

    return None
def download_content_node(category_node, url, title, thumbnail=None,
                          description=None):
    """Download a resource page into an HTML5 app node under category_node;
    if the page embeds a YouTube video, also add a sibling VideoNode.

    Args:
        category_node (TopicNode): parent node the new children are added to.
        url (str): resource page to download.
        title (str): node title (truncated to metadata limits).
        thumbnail (str, optional): thumbnail url, downloaded locally.
        description (str, optional): node description.
    """
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc, destination,
            'https://k12.thoughtfullearning.com', request_fn=make_request,
            url_blacklist=url_blacklist)

    # Strip site chrome so only the resource content remains.
    remove_node(doc, '#header')
    remove_node(doc, '.subMenuBarContainer')
    remove_node(doc, '.breadbookmarkcontainer')
    remove_node(doc, '.resourcePageTypeTitle')
    remove_node(doc, '.sharethis-wrapper')
    remove_node(doc, '.ccBlock')
    remove_node(doc, '#block-views-resource-info-block-block-1')
    remove_node(doc, '#block-views-resource-info-block-block-1')
    remove_node(doc, '#block-views-resource-info-block-block')
    remove_node(doc, '.productSuggestionContainer')
    remove_node(doc, 'footer')

    # For minilessons
    remove_node(doc, '.field-name-field-minilesson-downloadables')

    # For writing assessments
    remove_node(doc, '.assessmentTGLink')
    remove_node(doc, '.assessmentModelRubrics')
    remove_node(doc, '.view-display-id-attachment_1')

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print(" ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    thumbnail_path = None
    if thumbnail:
        # Manually download the thumbnail and use it so we can lowercase the
        # extension to be accepted by Ricecooker.
        thumbnail_filename = derive_filename(thumbnail)
        thumbnail_path = os.path.join(destination, thumbnail_filename)
        download_file(thumbnail, destination, request_fn=make_request,
                      filename=thumbnail_filename)

    # If there is an embedded video in the page source grab it as a video node.
    video_node = None
    iframe = doc.select_one('.embedded-video iframe')
    if iframe:
        youtube_url = iframe['src']
        youtube_id = get_youtube_id_from_url(youtube_url)
        info = ydl.extract_info(youtube_url, download=False)
        video_title = info['title']
        print(" ... and with video titled %s from www.youtube.com/watch?v=%s" % (
            video_title, youtube_id))
        video_node = nodes.VideoNode(
            source_id=youtube_id,
            title=truncate_metadata(info['title']),
            license=licenses.CC_BY_NC_SALicense(
                copyright_holder=truncate_metadata('Thoughtful Learning')),
            description=info['description'],
            language="en",
            derive_thumbnail=True,
            files=[files.YouTubeVideoFile(youtube_id)],
        )
        category_node.add_child(video_node)

    zip_path = create_predictable_zip(destination)
    app_node = nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        description=description,
        thumbnail=thumbnail_path,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )
    category_node.add_child(app_node)
def _build_tree(node, sourcetree):
    """
    Parse nodes given in `sourcetree` and add as children of `node`.

    Each entry in `sourcetree` is a dict describing one content item; its kind
    (topic/video/audio/document/exercise/html5) is inferred from its first
    file's attributes via `guess_content_kind`. Topics recurse into their
    `children` list. Entries whose kind cannot be determined are skipped.

    Args:
        node: parent node (TopicNode or ChannelNode) to attach children to.
        sourcetree (list): list of source node dicts.

    Returns:
        The same `node`, with children attached.
    """
    for child_source_node in sourcetree:
        try:
            main_file = child_source_node['files'][0] if 'files' in child_source_node else {}
            kind = guess_content_kind(path=main_file.get('path'),
                                      web_video_data=main_file.get('youtube_id') or main_file.get('web_url'),
                                      questions=child_source_node.get("questions"))
        except UnknownContentKindError:
            # Unrecognized entries are skipped rather than aborting the tree build.
            continue

        if kind == content_kinds.TOPIC:
            child_node = nodes.TopicNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            node.add_child(child_node)
            source_tree_children = child_source_node.get("children", [])
            _build_tree(child_node, source_tree_children)

        elif kind == content_kinds.VIDEO:
            child_node = nodes.VideoNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=get_license(child_source_node.get("license"), description="Description of license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                derive_thumbnail=True,  # video-specific data
                thumbnail=child_source_node.get('thumbnail'),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.AUDIO:
            child_node = nodes.AudioNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.DOCUMENT:
            child_node = nodes.DocumentNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.EXERCISE:
            child_node = nodes.ExerciseNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                exercise_data={},  # Just set to default
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            # `or []` guards against a missing/None "questions" key, matching
            # the treatment of "files" above (was a TypeError before).
            for q in child_source_node.get("questions") or []:
                question = create_question(q)
                child_node.add_question(question)
            node.add_child(child_node)

        elif kind == content_kinds.HTML5:
            child_node = nodes.HTML5AppNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        else:  # unknown content file format
            continue

    return node
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively parse nodes in the list `sourcetree` and add them as children
    to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`.

    Each source node must carry an explicit 'kind' key; an unexpected kind is
    logged and raises NotImplementedError. Topic nodes recurse into their
    'children' list; all other kinds are leaves built from the shared metadata
    kwargs (see `_leaf_node_kwargs`) plus any kind-specific options.

    Returns:
        The same `parent_node`, with children attached.
    """
    # Constructors for the leaf kinds that share the same metadata kwargs.
    leaf_constructors = {
        VIDEO_NODE: nodes.VideoNode,
        AUDIO_NODE: nodes.AudioNode,
        DOCUMENT_NODE: nodes.DocumentNode,
        HTML5_NODE: nodes.HTML5AppNode,
        SLIDESHOW_NODE: nodes.SlideshowNode,
    }

    for source_node in sourcetree:
        kind = source_node['kind']

        if kind == TOPIC_NODE:
            # Topics have no license/role (role is computed dynamically from
            # descendants) and tolerate a missing source_id.
            child_node = nodes.TopicNode(
                source_id=source_node.get('source_id', None),
                title=source_node['title'],
                description=source_node.get('description'),
                author=source_node.get('author'),
                aggregator=source_node.get('aggregator'),
                provider=source_node.get('provider'),
                language=source_node.get('language'),
                thumbnail=source_node.get('thumbnail'),
                tags=source_node.get('tags'),
            )
            parent_node.add_child(child_node)
            build_tree_from_json(child_node, source_node.get('children', []))

        elif kind == EXERCISE_NODE:
            child_node = nodes.ExerciseNode(
                exercise_data=source_node.get('exercise_data'),
                questions=[],  # populated below via add_questions
                **_leaf_node_kwargs(source_node)
            )
            add_questions(child_node, source_node.get('questions') or [])
            parent_node.add_child(child_node)

        elif kind in leaf_constructors:
            extra_kwargs = {}
            if kind == VIDEO_NODE:
                # video-specific option
                extra_kwargs['derive_thumbnail'] = source_node.get('derive_thumbnail', True)
            child_node = leaf_constructors[kind](
                **_leaf_node_kwargs(source_node), **extra_kwargs
            )
            add_files(child_node, source_node.get('files') or [])
            parent_node.add_child(child_node)

        else:
            LOGGER.critical('Unexpected node kind found: ' + kind)
            raise NotImplementedError('Unexpected node kind found in json data.')

    return parent_node


def _leaf_node_kwargs(source_node):
    """Shared constructor kwargs for every non-topic node kind.

    Note: unlike topics, leaf nodes require 'source_id' and 'license' keys
    (KeyError if absent), matching the original per-kind constructors.
    """
    return dict(
        source_id=source_node['source_id'],
        title=source_node['title'],
        description=source_node.get('description'),
        license=get_license(**source_node['license']),
        author=source_node.get('author'),
        aggregator=source_node.get('aggregator'),
        provider=source_node.get('provider'),
        role=source_node.get('role', roles.LEARNER),
        language=source_node.get('language'),
        thumbnail=source_node.get('thumbnail'),
        tags=source_node.get('tags'),
    )