def fetch_video(video):
    """Build a VideoNode, with subtitle files attached, for one YouTube video.

    `video` is a mapping that must provide the 'id', 'title', 'description',
    'webpage_url' and 'subtitles' keys. The keys of `video['subtitles']` are
    used as subtitle language codes. Returns the populated node.
    """
    video_id = video['id']
    video_title = video['title']
    video_description = video['description']
    page_url = video['webpage_url']
    caption_codes = video['subtitles'].keys()

    print("    Fetching video data: %s (%s)" % (video_title, page_url))

    node = nodes.VideoNode(
        source_id=video_id,
        title=truncate_metadata(video_title),
        license=LICENSE,
        description=truncate_description(video_description),
        derive_thumbnail=True,
        language="en",
        files=[files.YouTubeVideoFile(youtube_id=video_id)],
    )

    # Attach one subtitle file per language code that the languages module
    # can resolve; warn about (and skip) every other code.
    for code in caption_codes:
        # TODO(david): Should catch exception thrown by
        # files.YouTubeSubtitleFile rather than breaking abstraction.
        known = languages.getlang(code) or languages.getlang_by_alpha2(code)
        if not known:
            print("WARNING: Subtitle language %s not found in languages file" %
                  code)
            continue
        node.add_file(
            files.YouTubeSubtitleFile(youtube_id=video_id, language=code))

    return node
Exemple #2
0
    def add_content_to_tree(self, channel):
        """Populate `channel` with class -> subject -> video nodes.

        Walks the 'English' branch of `self.channel_tree`: every class becomes
        a TopicNode under `channel`, every subject a TopicNode under its
        class, and every entry of the subject's 'items' list a VideoNode.
        """
        tree = self.channel_tree
        lang = 'English'
        lang_obj = getlang("en")
        classes = tree[lang]
        for class_name in classes:
            subjects = classes[class_name]
            class_id = "{}-{}".format(lang, class_name)
            # The class topic itself is keyed by the bare class name;
            # class_id only serves as the prefix for the ids below it.
            class_topic = nodes.TopicNode(source_id=class_name, title=class_name)
            for subject_name in subjects:
                subject_id = "{}-{}".format(class_id, subject_name)
                subject_topic = nodes.TopicNode(source_id=subject_id,
                                                title=subject_name)
                for row in subjects[subject_name]['items']:
                    row_id = "{}-{}".format(subject_id, get_column(row, 'id'))
                    subject_topic.add_child(nodes.VideoNode(
                        source_id=row_id,
                        title=get_column(row, 'name'),
                        description=get_column(row, 'description'),
                        files=[files.VideoFile(path=get_column(row, 'file'))],
                        language=lang_obj,
                        # FIXME: Use the column's license field instead of hardcoding.
                        license=licenses.get_license(
                            le_licenses.CC_BY,
                            copyright_holder=get_column(row, "copyright")),
                        # NOTE: the "thumbnail" column is currently not used.
                    ))

                class_topic.add_child(subject_topic)

            channel.add_child(class_topic)
def fetch_video(video):
    """Build a VideoNode for one YouTube video, with patched-language subtitles.

    `video` is a mapping that must provide the 'id', 'title', 'description',
    'webpage_url' and 'subtitles' keys. A subtitle file is attached for each
    key of `video['subtitles']` for which `getlang_patched` returns a truthy
    value; other languages are skipped silently. Returns the populated node.
    """
    video_id = video['id']
    video_title = video['title']
    video_description = video['description']
    page_url = video['webpage_url']
    caption_codes = video['subtitles'].keys()

    print("    Fetching video data: %s (%s)" % (video_title, page_url))

    node = nodes.VideoNode(
        source_id=video_id,
        title=truncate_metadata(video_title),
        license=LICENSE,
        description=truncate_description(video_description),
        derive_thumbnail=True,
        language="en",
        files=[files.YouTubeVideoFile(youtube_id=video_id)],
    )

    # Attach subtitles only for the languages getlang_patched recognises.
    for code in caption_codes:
        if not getlang_patched(code):
            continue
        node.add_file(
            LanguagePatchedYouTubeSubtitleFile(youtube_id=video_id,
                                               youtube_language=code))

    return node
Exemple #4
0
def scrape_directory(topic, directory, indent=1):
    """Mirror the contents of `directory` as children of `topic`.

    Only the top level of `directory` is read in each call. Every sub-folder
    becomes a TopicNode that is filled by a recursive call, '.mp4' files
    become VideoNodes, and '.pdf' files are handed to `generate_pdf_nodes`.
    Files with any other extension are ignored. `indent` only controls how
    far the printed folder names are indented.
    """
    # Take just the first (top-level) entry of the walk; deeper levels are
    # reached through the explicit recursion below.
    top_level = next(os.walk(directory), None)
    if top_level is None:
        return
    current_dir, folder_names, file_names = top_level

    # Sub-folders first: one topic per folder, then descend into it.
    for folder_name in folder_names:
        print('{}{}'.format('    ' * indent, folder_name))
        child_topic = nodes.TopicNode(source_id=folder_name, title=folder_name)
        topic.add_child(child_topic)
        scrape_directory(child_topic,
                         os.sep.join([current_dir, folder_name]),
                         indent=indent + 1)

    # Then the files that live directly in this folder.
    for file_name in file_names:
        stem, extension = os.path.splitext(file_name)
        if extension == '.mp4':
            # source_id is the directory path and file name concatenated
            # without a separator.
            video_node = nodes.VideoNode(source_id=current_dir + file_name,
                                         title=stem,
                                         license=LICENSE,
                                         copyright_holder=COPYRIGHT_HOLDER)
            video_node.add_file(
                files.VideoFile(os.sep.join([current_dir, file_name])))
            topic.add_child(video_node)
        elif extension == '.pdf':
            pdf_path = os.path.sep.join([current_dir, file_name])
            with PDFParser(pdf_path) as parser:
                generate_pdf_nodes(parser.get_data_file(),
                                   topic,
                                   source=os.path.basename(file_name))
Exemple #5
0
def scrape_collection_files(topic, url):
    """Add the assets listed at `url` to `topic`.

    The JSON document read from `url` is expected to hold a 'data' list of
    assets. 'mp4' assets are added as VideoNodes (after a second request for
    their file details), 'png' assets are collected and added together as one
    slideshow node, and anything else is logged and skipped.
    """
    slideshow_images = []
    for asset in json.loads(downloader.read(url))['data']:
        extension = asset['attributes']['extension']

        if extension == 'png':
            # NOTE(review): the replacement text below looks like it was
            # redacted by whatever tool exported this file -- confirm the
            # intended file name before relying on these image URLs.
            slideshow_images.append({
                'url': asset['attributes']['thumbnail_url'].replace(
                    'element.png', '*****@*****.**'),
                'caption': asset['attributes']['name'],
            })
            continue

        if extension == 'mp4':
            details_url = FILE_STORAGE_URL.format(id=asset['id'])
            details = json.loads(downloader.read(details_url))
            video = details['data'][0]['attributes']
            video_node = nodes.VideoNode(
                source_id=video['url'],
                title=asset['attributes']['name'],
                license=LICENSE,
                files=[
                    files.VideoFile(video['url']),
                    files.ThumbnailFile(video['thumbnail_url']),
                ])
            topic.add_child(video_node)
            continue

        LOGGER.warning('Unable to add {} from {}'.format(extension, url))

    # Add images to slideshow node
    if slideshow_images:
        topic.add_child(
            create_slideshow(slideshow_images, url, topic.title, 'English'))
    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and build topic tree
        Args:
          - args: arguments passed in during upload_channel (currently None)
          - kwargs: extra arguments and options not handled by `uploadchannel`.
            For example, add the command line option   lang="fr"  and the string
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Returns: ChannelNode

        Healing Classrooms is organized with the following hierarchy:
            Playlist (TopicNode)
            |   Youtube Video (VideoNode)
            |   Youtube Video (VideoNode)

        """
        channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

        # Download the playlist/video information
        with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
            info_dict = ydl.extract_info(PLAYLISTS_URL, download=False)

            # Generate topics based off playlist entries in dict
            for playlist in info_dict['entries']:

                # Get language of playlist (hack): guessed from the playlist
                # title, defaulting to French.
                language = "fr"
                if "English" in playlist['title']:
                    language = "en"
                elif "Arabic" in playlist['title']:
                    language = "ar"

                playlist_topic = nodes.TopicNode(title=playlist['title'], source_id=playlist['id'], language=language)
                channel.add_child(playlist_topic)

                # Generate videos based off video entries in dict
                for video in playlist['entries']:
                    # Use the first listed thumbnail when there is one.
                    # A missing/empty list (or empty url) is normalized to
                    # None so the node never receives a non-string
                    # placeholder such as 0 as its thumbnail.
                    thumbnails = video['thumbnails']
                    thumbnail_url = (thumbnails[0]['url'] if thumbnails else None) or None

                    playlist_topic.add_child(nodes.VideoNode(
                        title=video['title'],
                        source_id=video['id'],
                        license=licenses.PublicDomainLicense(),
                        description=video['description'],
                        # Only derive a thumbnail when none was provided.
                        derive_thumbnail=not thumbnail_url,
                        files=[files.WebVideoFile(video['webpage_url'])],
                        thumbnail=thumbnail_url,
                        author=AUTHOR,
                        # tags = video['categories'] + video['tags'], # TODO: uncomment this when added
                    ))

        raise_for_invalid_channel(channel)  # Check for errors in channel construction

        return channel
 def to_contentnode(self, title, directory=None, *args, **kwargs):
     """Return a content node for this object, chosen by ``self.kind``.

     The object is first written out with ``self.to_file(directory=...)``.
     HTML5 kinds yield an HTML5AppNode and VIDEO kinds a VideoNode, each
     with ``self.url`` as source id and any extra ``kwargs`` forwarded to
     the node constructor. Any other kind yields None. Positional ``args``
     are accepted but not used.
     """
     local_path = self.to_file(directory=directory)

     if self.kind == content_kinds.HTML5:
         return nodes.HTML5AppNode(
             source_id=self.url,
             title=title,
             files=[files.HTMLZipFile(local_path)],
             **kwargs
         )

     if self.kind == content_kinds.VIDEO:
         return nodes.VideoNode(
             source_id=self.url,
             title=title,
             files=[files.VideoFile(local_path)],
             **kwargs
         )

     # Unsupported kind: nothing to build.
     return None
def scrape_iversity(channel):
    """Scrape the iversity lesson-unit page into topics and videos on `channel`.

    Each 'chapter-units-wrapper' div becomes a TopicNode. Within a chapter,
    every 'unit-wrapper' link that carries a 'unit_video' icon is opened and
    turned into a VideoNode (480 source plus its subtitles track); links
    without the icon are only logged.
    """
    url = "{}/en/my/courses/rethinking-us-them-integration-and-diversity-in-europe/lesson_units".format(
        BASE_URL)
    LOGGER.info("   Scraping Migration Matters at {}".format(url))
    page = read_source(url)

    for chapter in page.find_all('div', {'class': 'chapter-units-wrapper'}):
        chapter_title = str(
            chapter.find('div', {'class': 'chapter-title'}).string)
        topic = nodes.TopicNode(
            source_id=chapter_title.strip().replace(" ", "_"),
            title=chapter_title)

        for lesson in chapter.find_all('a', {'class': 'unit-wrapper'}):
            has_video = lesson.find('i', {'class': 'unit_video'})
            title_tag = lesson.find('span', {'class': 'unit-title'})
            lesson_title = str(title_tag.string).strip()

            if not has_video:
                LOGGER.info(
                    "Format of the file is not supported by the sushi chef : {}"
                    .format(lesson_title))
                continue

            lesson_source_id = lesson_title.replace(" ", "_")
            lesson_url = "{}{}".format(BASE_URL, lesson.attrs["href"])
            player = read_source(lesson_url).find('video')

            # Subtitles come from the <track kind="subtitles"> element.
            subtitle_src = player.find(
                'track', {'kind': 'subtitles'}).attrs["src"]
            subtitle_file = files.SubtitleFile(
                path=subtitle_src,
                language=languages.getlang('en').code)

            # The video itself is the <source res="480"> variant.
            video_src = player.find('source', {'res': '480'}).attrs["src"]
            video_file = files.VideoFile(
                path=video_src, language=languages.getlang('en').code)

            video_node = nodes.VideoNode(
                source_id=lesson_source_id,
                title=lesson_title,
                files=[video_file, subtitle_file],
                license=CHANNEL_LICENSE,
                copyright_holder=COPYRIGHT_HOLDER)
            LOGGER.info("   Uploading video - {}".format(
                lesson_title.strip()))
            topic.add_child(video_node)

        channel.add_child(topic)
Exemple #9
0
def _build_tree(node, sourcetree):
    """
    Parse nodes given in `sourcetree` and add as children of `node`.
    """
    for child_source_node in sourcetree:
        try:
            main_file = child_source_node['files'][
                0] if 'files' in child_source_node else {}
            kind = guess_content_kind(
                path=main_file.get('path'),
                web_video_data=main_file.get('youtube_id')
                or main_file.get('web_url'),
                questions=child_source_node.get("questions"))
        except UnknownContentKindError:
            continue

        if kind == content_kinds.TOPIC:
            child_node = nodes.TopicNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            node.add_child(child_node)

            source_tree_children = child_source_node.get("children", [])

            _build_tree(child_node, source_tree_children)

        elif kind == content_kinds.VIDEO:
            child_node = nodes.VideoNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=get_license(child_source_node.get("license"),
                                    description="Description of license",
                                    copyright_holder=child_source_node.get(
                                        'copyright_holder')),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                derive_thumbnail=True,  # video-specific data
                thumbnail=child_source_node.get('thumbnail'),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        else:  # unknown content file format
            continue

    return node
def scrape_video_collection(url, topic):
    """ Scrape videos under video collection and add to the topic node
        Args:
            url (str): url to video page (e.g. https://www.exploratorium.edu/video/inflatable-jimmy-kuehnle)
            topic (TopicNode): topic to add video nodes to

        Result pages are followed one after another until there is no next
        page. An HTTP error while handling a page is logged against that
        page's url and stops the scrape.
    """
    page_url = url
    while True:
        try:
            page = BeautifulSoup(read(page_url), 'html5lib')
            for result in page.find_all('div', {'class': 'search-result'}):
                header = result.find(
                    'div', {'class': 'views-field-field-html-title'})
                heading = header.text.strip()
                LOGGER.info("            {}".format(heading))

                # Get video from given url
                summary = result.find('div', {'class': 'search-description'})
                video_page = BeautifulSoup(read(header.find('a')['href']),
                                           'html.parser')
                for video_id, video in get_brightcove_mapping(
                        video_page).items():
                    candidate = nodes.VideoNode(
                        source_id=video_id,
                        title=heading.replace("’", "'"),
                        description=summary.text.strip() if summary else "",
                        license=LICENSE,
                        copyright_holder=COPYRIGHT_HOLDER,
                        author=video.get('author') or "",
                        files=[
                            files.WebVideoFile(video['url'],
                                               high_resolution=False)
                        ],
                        thumbnail=get_thumbnail_url(result.find('img')['src']),
                    )

                    # If video doesn't already exist here, add to topic
                    existing = None
                    for child in topic.children:
                        if child.source_id == candidate.source_id:
                            existing = child
                            break
                    if not existing:
                        topic.add_child(candidate)

            # Scrape next page (if any)
            next_page_url = get_next_page_url(page)
        except requests.exceptions.HTTPError:
            LOGGER.error("Could not read collection at {}".format(page_url))
            return

        if not next_page_url:
            return
        page_url = next_page_url
def download_video_topics(topic_node,
                          playlist_item,
                          lang_obj,
                          use_cache=True,
                          to_sheet=False):
    """
    Scrape, collect, and download the videos from playlist.

    A VideoNode is added to `topic_node` for every playlist entry whose id
    has a description in VIDEO_DESCRIPTION_MAP; all other entries are
    excluded. `lang_obj` supplies the `name` used in the source id and the
    `code` used as the node/file language. `to_sheet` is accepted for
    interface compatibility but is not used here.
    """
    playlist_obj = RefugeeResponsePlaylist(playlist_item, use_cache)
    playlist_info = playlist_obj.get_playlist_info()
    # `.get('children')` yields None when the key is absent; treat that the
    # same as an empty playlist instead of failing on iteration.
    for video in playlist_info.get('children') or []:
        video_id = video['id']
        video_source_id = 'refugee-response-{0}-{1}'.format(
            lang_obj.name, video_id)
        if video_id not in VIDEO_DESCRIPTION_MAP:
            # Exclude videos
            continue
        video_description = VIDEO_DESCRIPTION_MAP[video_id]
        LOGGER.info("Video Description: '%s'", video_description)
        try:
            video_node = nodes.VideoNode(
                source_id=video_source_id,
                title=video['title'],
                description=video_description,
                author=REFUGEE_RESPONSE,
                language=lang_obj.code,
                provider=REFUGEE_RESPONSE,
                thumbnail=video['thumbnail'],
                license=licenses.get_license(
                    "CC BY-NC-ND", copyright_holder=REFUGEE_RESPONSE),
                files=[
                    files.YouTubeVideoFile(youtube_id=video_id,
                                           language=lang_obj.code)
                ])
            topic_node.add_child(video_node)
        except Exception as e:
            # Best effort: one failing video is logged and must not abort
            # the rest of the playlist.
            LOGGER.error('Error downloading this video: %s', e)
def make_content_node(kind, source_id, title, license, filepath, optionals):
    """
    Build the content node matching `kind` for the file at `filepath`.

    VIDEO, AUDIO and DOCUMENT kinds are supported; for any other kind the
    result is None. `optionals` is a mapping from which "author" and
    "description" are read (plus "thumbnail" for audio and documents);
    missing keys default to None. Video nodes derive their thumbnail
    instead of reading one from `optionals`.
    """
    if kind == content_kinds.VIDEO:
        return nodes.VideoNode(
            source_id=source_id,
            title=title,
            license=license,
            author=optionals.get("author"),
            description=optionals.get("description"),
            derive_thumbnail=True,  # video-specific data
            files=[files.VideoFile(path=filepath)],
        )

    if kind == content_kinds.AUDIO:
        return nodes.AudioNode(
            source_id=source_id,
            title=title,
            license=license,
            author=optionals.get("author"),
            description=optionals.get("description"),
            thumbnail=optionals.get("thumbnail"),
            files=[files.AudioFile(path=filepath)],
        )

    if kind == content_kinds.DOCUMENT:
        return nodes.DocumentNode(
            source_id=source_id,
            title=title,
            license=license,
            author=optionals.get("author"),
            description=optionals.get("description"),
            thumbnail=optionals.get("thumbnail"),
            files=[files.DocumentFile(path=filepath)],
        )

    # Unsupported kind.
    return None
Exemple #13
0
def scrape_content(endpoint, channel, existingNode=None):
    """Recursively scrape the listing at `endpoint` into topic and video nodes.

    Every "text-xs-left" cell in the page's table body is inspected through
    its "data-sort-value" attribute: values ending in ".mp4" become
    VideoNodes, values starting with "dir" become TopicNodes that are then
    scraped recursively, and anything else is only logged.

    New nodes are attached to `existingNode` when one is given and to
    `channel` otherwise.
    """
    replacements = {" ": "%20", "#": "%23"}
    # Both videos and topics found on this page share the same parent.
    # Previously videos were always added to `existingNode`, which failed
    # with AttributeError for videos listed at the top level (no parent).
    parent = existingNode if existingNode else channel

    content = read_source(endpoint)
    attributes = content.find("tbody").find_all("td", "text-xs-left")

    for attribute in attributes:
        source_id = attribute.attrs["data-sort-value"]

        # Check if it is mp4 file
        if source_id.endswith(".mp4"):
            video_info = attribute.find("a")
            video_title, _ext = splitext(str(video_info.string))
            # The href's first character is dropped before it is appended to
            # BASE_URL, and spaces are percent-encoded.
            filter_video_link = video_info.attrs["href"][1:].replace(
                " ", "%20")
            video_link = BASE_URL + filter_video_link
            video_file = files.VideoFile(path=video_link)
            video_node = nodes.VideoNode(source_id=source_id,
                                         title=video_title,
                                         files=[video_file],
                                         license=CHANNEL_LICENSE)
            parent.add_child(video_node)

        # Check if it is a directory
        elif source_id.startswith("dir"):
            title = str(attribute.find("strong").string)
            topic_node = nodes.TopicNode(source_id=source_id, title=title)
            parent.add_child(topic_node)

            # The sub-listing lives at <endpoint><escaped title>/.
            new_end_point = replace_all(title, replacements)
            new_end = endpoint + "{}/".format(new_end_point)
            scrape_content(new_end, channel, topic_node)
        else:
            LOGGER.info(
                "Format of the file is not supported by the sushi chef : {}".
                format(source_id))
Exemple #14
0
    def video_node_from_dropbox(self, video_details, link, token):
        """Fetch the video behind a Dropbox shared `link` and wrap it in a VideoNode.

        The file is stored under VIDEO_FOLDER using the name reported by
        Dropbox; if a file with that name is already there it is reused and
        nothing is written. The node takes its title from
        `video_details["title"]` and uses `link` as its source id.
        """
        client = dropbox.Dropbox(token)
        metadata, response = client.sharing_get_shared_link_file(url=link)

        # Path of the local copy, relative to the current working directory.
        local_path = os.path.relpath(
            os.path.join(VIDEO_FOLDER, metadata.name))

        if os.path.isfile(local_path):
            LOGGER.info("{} already downloaded. Skipping".format(
                metadata.name))
        else:
            with open(local_path, 'wb') as out:
                out.write(response.content)

        local_video = files.VideoFile(path=local_path)
        return nodes.VideoNode(
            title=video_details["title"],
            source_id=link,
            license=licenses.CC_BYLicense("TicTacLearn"),
            files=[local_video])
def _build_tree(node, sourcetree):
    """
    Parse nodes given in `sourcetree` and add as children of `node`.
    """
    for child_source_node in sourcetree:
        try:
            main_file = child_source_node['files'][0] if 'files' in child_source_node else {}
            kind = guess_content_kind(path=main_file.get('path'), web_video_data=main_file.get('youtube_id') or main_file.get('web_url'), questions=child_source_node.get("questions"))
        except UnknownContentKindError:
            continue

        if kind == content_kinds.TOPIC:
            child_node = nodes.TopicNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            node.add_child(child_node)

            source_tree_children = child_source_node.get("children", [])

            _build_tree(child_node, source_tree_children)

        elif kind == content_kinds.VIDEO:
            child_node = nodes.VideoNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=get_license(child_source_node.get("license"), description="Description of license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                derive_thumbnail=True, # video-specific data
                thumbnail=child_source_node.get('thumbnail'),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.AUDIO:
            child_node = nodes.AudioNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.DOCUMENT:
            child_node = nodes.DocumentNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.EXERCISE:
            child_node = nodes.ExerciseNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                exercise_data={}, # Just set to default
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            for q in child_source_node.get("questions"):
                question = create_question(q)
                child_node.add_question(question)
            node.add_child(child_node)

        elif kind == content_kinds.HTML5:
            child_node = nodes.HTML5AppNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        else:                   # unknown content file format
            continue

    return node
Exemple #16
0
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively parse nodes in the list `sourcetree` and add them as children
    to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`.

    Each entry must carry a 'kind' that is one of the expected node types.
    Topic entries are filled recursively from their "children"; exercise
    entries receive their "questions"; all other kinds receive their "files".

    Returns `parent_node`.

    Raises:
        NotImplementedError: if an entry has an unexpected 'kind'.
    """
    EXPECTED_NODE_TYPES = [
        TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE, DOCUMENT_NODE,
        HTML5_NODE
    ]

    for source_node in sourcetree:
        kind = source_node['kind']
        if kind not in EXPECTED_NODE_TYPES:
            # Log with a format argument rather than string concatenation so
            # a non-string kind (e.g. None) is still reported and leads to
            # the intended NotImplementedError instead of a TypeError.
            LOGGER.critical('Unexpected node type found: %s', kind)
            raise NotImplementedError(
                'Unexpected node type found in json data.')

        if kind == TOPIC_NODE:
            child_node = nodes.TopicNode(
                source_id=source_node.get("source_id", None),
                title=source_node["title"],
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
            )
            parent_node.add_child(child_node)
            source_tree_children = source_node.get("children", [])
            build_tree_from_json(child_node, source_tree_children)

        elif kind == VIDEO_NODE:
            child_node = nodes.VideoNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                derive_thumbnail=source_node.get(
                    'derive_thumbnail', True),  # video-specific option
                thumbnail=source_node.get('thumbnail'),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        elif kind == AUDIO_NODE:
            child_node = nodes.AudioNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get('thumbnail'),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        elif kind == EXERCISE_NODE:
            child_node = nodes.ExerciseNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
                questions=[],
            )
            add_questions(child_node, source_node.get("questions") or [])
            parent_node.add_child(child_node)

        elif kind == DOCUMENT_NODE:
            child_node = nodes.DocumentNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        elif kind == HTML5_NODE:
            child_node = nodes.HTML5AppNode(
                source_id=source_node["source_id"],
                title=source_node["title"],
                license=get_license(**source_node['license']),
                author=source_node.get("author"),
                description=source_node.get("description"),
                language=source_node.get('language', None),
                thumbnail=source_node.get("thumbnail"),
            )
            add_files(child_node, source_node.get("files") or [])
            parent_node.add_child(child_node)

        # No fallback branch is needed: every kind that passed the
        # EXPECTED_NODE_TYPES check above is handled by one of the branches.

    return parent_node
    def add_video_nodes_from_playlist(
            self,
            youtube_client,
            playlist_id,
            subtitle_languages=SUBTITLE_LANGUAGES,
            copyright_holder=COPYRIGHT_HOLDER,
            only_creative_commons=ONLY_CREATIVE_COMMONS):
        """Add one VideoNode child per distinct video of a YouTube playlist.

        The playlist is read page by page (50 items per request) through
        `youtube_client`. For every video not seen before, a VideoNode with
        a YouTube video file and one subtitle file per supported code in
        `subtitle_languages` is added to `self`.

        When `only_creative_commons` is true, videos whose status license is
        not 'creativeCommon' are reported on stdout and skipped; otherwise
        they are added under NON_CREATIVE_COMMONS_LICENSE_DEFAULT.
        """
        playlist_request_kwargs = {
            'part': 'contentDetails',
            'maxResults': 50,
            'playlistId': playlist_id,
        }

        # Apparently the same video is in one of the playlists twice!
        # This is used to keep track of videos that have already been added.
        videos_added = set()
        while True:
            playlist_info = youtube_client.playlistItems().list(
                **playlist_request_kwargs).execute()

            video_ids = [
                vid['contentDetails']['videoId']
                for vid in playlist_info['items']
            ]
            videos = youtube_client.videos().list(
                part='status,snippet',
                id=','.join(video_ids)).execute()['items']

            for video in videos:
                video_id = video['id']
                if video_id in videos_added:
                    continue

                youtube_license = video['status']['license']
                is_creative_commons = youtube_license == 'creativeCommon'
                if only_creative_commons and not is_creative_commons:
                    print(
                        "The video '{}' is not licensed as Creative Commons... it is licensed as {}"
                        .format(video['snippet']['title'], youtube_license))
                    continue

                video_license = (licenses.CC_BY if is_creative_commons else
                                 NON_CREATIVE_COMMONS_LICENSE_DEFAULT)
                # The bare video id is the source id (not combined with the
                # playlist id).
                video_node = nodes.VideoNode(
                    source_id=video_id,
                    title=video['snippet']['title'],
                    language=CHANNEL_LANGUAGE,
                    license=get_license(
                        video_license,
                        copyright_holder=copyright_holder),
                    thumbnail=get_largest_thumbnail(
                        video['snippet']['thumbnails']).get('url'),
                    files=[
                        files.YouTubeVideoFile(video_id),
                    ])

                # Get subtitles for languages designated in SUBTITLE_LANGUAGES
                for lang_code in subtitle_languages:
                    if files.is_youtube_subtitle_file_supported_language(
                            lang_code):
                        video_node.add_file(
                            files.YouTubeSubtitleFile(
                                youtube_id=video_id,
                                language=lang_code))
                    else:
                        print('Unsupported subtitle language code:',
                              lang_code)

                self.add_child(video_node)
                videos_added.add(video_id)

            # set up the next page, if there is one
            next_page_token = playlist_info.get('nextPageToken')
            if not next_page_token:
                break
            playlist_request_kwargs['pageToken'] = next_page_token
def scrape_content(title, content_url):
    """Scrape a single content page into a ricecooker node.

    Example arguments::

        title: Boys' clothing
        content_url: http://www.touchableearth.org/china-culture-boys-clothing/

    Returns:
        * a ``nodes.VideoNode`` when the page embeds a video iframe whose
          YouTube id can be extracted and whose metadata can be fetched;
        * a ``nodes.HTML5AppNode`` wrapping the page's main image when there
          is no video iframe but an image is present;
        * ``None`` when the page cannot be parsed, the video is unavailable,
          or neither a video nor a usable image is found.
    """
    print("    Scraping content node: %s (%s)" % (title, content_url))

    doc = get_parsed_html_from_url(content_url)
    if not doc:  # 404
        return None

    description = create_description(doc)
    source_id = doc.select_one(".current_post.active .post_id")["value"]

    # Attributes shared by whichever node type ends up being created.
    base_node_attributes = {
        "source_id": source_id,
        "title": title,
        "license": TE_LICENSE,
        "description": description,
    }

    youtube_iframe = doc.select_one(".video-container iframe")
    if youtube_iframe:
        # Reuse the element found above instead of querying the page again.
        youtube_url = youtube_iframe["src"]
        youtube_id = get_youtube_id_from_url(youtube_url)

        if not youtube_id:
            print("    *** WARNING: youtube_id not found for content url",
                  content_url)
            print("    Skipping.")
            return None

        try:
            info = ydl.extract_info(youtube_url, download=False)
            subtitles = info.get("subtitles")
            subtitle_languages = subtitles.keys() if subtitles else []
            print("      ... with subtitles in languages:", subtitle_languages)
        except youtube_dl.DownloadError as e:
            # Some of the videos have been removed from the YouTube channel --
            # skip creating content nodes for them entirely so they don't show up
            # as non-loadable videos in Kolibri.
            print("        NOTE: Skipping video download due to error: ", e)
            return None

        video_node = nodes.VideoNode(
            **base_node_attributes,
            derive_thumbnail=True,
            files=[WatermarkedYouTubeVideoFile(youtube_id=youtube_id)],
        )

        # Add subtitles in whichever languages are available.
        for language in subtitle_languages:
            video_node.add_file(
                files.YouTubeSubtitleFile(youtube_id=youtube_id,
                                          language=language))

        return video_node

    img = doc.select_one(".uncode-single-media-wrapper img")
    if img:
        # Prefer the `data-guid` attribute and fall back to `src`. Subscript
        # access raises KeyError on a missing attribute, which would defeat
        # the fallback, so use `.get()` for both lookups.
        img_src = img.get("data-guid") or img.get("src")
        if not img_src:
            # Nothing to download; treat like a page without usable content.
            return None

        destination = tempfile.mkdtemp()
        download_file(img_src,
                      destination,
                      request_fn=make_request,
                      filename="image.jpg")

        # Minimal HTML wrapper that displays the downloaded image.
        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write("""
                <!doctype html>
                <html>
                <head></head>
                <body>
                    <img src="image.jpg" style="width: 100%; max-width: 1200px;" />
                </body>
                </html>
            """)

        zip_path = create_predictable_zip(destination)

        return nodes.HTML5AppNode(
            **base_node_attributes,
            files=[files.HTMLZipFile(zip_path)],
            thumbnail=img_src,
        )

    return None
Exemple #19
0
    def construct_channel(self, *args, **kwargs):
        """
        Build the channel tree: one TopicNode per playlist of the YouTube
        channel, each holding a VideoNode for every video of that playlist
        whose license status is 'creativeCommon'.

        Args:
          - args: positional arguments, forwarded to `self.get_channel`.
          - kwargs: keyword options, forwarded to `self.get_channel`; the
            entry kwargs['--youtube-api-token'] is used as the developer key
            of the YouTube Data API client.
        Returns: ChannelNode
        """
        # Create ChannelNode from data in self.channel_info
        channel = self.get_channel(*args, **kwargs)

        from apiclient.discovery import build

        # YouTube Data API v3 client
        api_key = kwargs['--youtube-api-token']
        youtube = build('youtube', 'v3', developerKey=api_key)

        # Playlists of the channel (a single request of up to 50 results).
        playlists_request = youtube.playlists().list(
            part='snippet',
            channelId=YOUTUBE_CHANNEL_ID,
            maxResults=50
        )
        playlists = playlists_request.execute()['items']

        # For getting the thumbnail automatically

        # youtube_channel = youtube.channels().list(
        #     id=YOUTUBE_CHANNEL_ID,
        #     part='snippet'
        # ).execute()['items'][0]

        # channel.thumbnail = get_largest_thumbnail(youtube_channel['snippet']['thumbnails']).get('url')

        for playlist in playlists:
            topic = nodes.TopicNode(
                title=playlist['snippet']['title'],
                source_id=playlist['id'],
            )
            request_params = {
                'part': 'contentDetails',
                'maxResults': 50,
                'playlistId': playlist['id'],
            }

            # Walk the playlist page by page; the first request carries no
            # page token, later ones carry the token returned by the
            # previous page.
            page_token = None
            while True:
                if page_token:
                    request_params['pageToken'] = page_token

                page = youtube.playlistItems().list(**request_params).execute()

                # Look up status and snippet for all videos of this page in
                # one request.
                id_list = ','.join(
                    item['contentDetails']['videoId'] for item in page['items'])
                videos = youtube.videos().list(
                    part='status,snippet',
                    id=id_list
                ).execute()['items']

                for video in videos:
                    if video['status']['license'] != 'creativeCommon':
                        continue

                    thumbnail_url = get_largest_thumbnail(
                        video['snippet']['thumbnails']).get('url')
                    video_node = nodes.VideoNode(
                        source_id=video['id'],
                        title=video['snippet']['title'],
                        language=CHANNEL_LANGUAGE,
                        license=get_license(licenses.CC_BY, copyright_holder='Espresso English'),
                        thumbnail=thumbnail_url,
                        files=[files.YouTubeVideoFile(video['id'])],
                    )
                    topic.add_child(video_node)

                    # Attach subtitles for the languages in SUBTITLE_LANGUAGES
                    for lang_code in SUBTITLE_LANGUAGES:
                        if not files.is_youtube_subtitle_file_supported_language(lang_code):
                            print('Unsupported subtitle language code:', lang_code)
                            continue
                        subtitle_file = files.YouTubeSubtitleFile(
                            youtube_id=video['id'],
                            language=lang_code
                        )
                        video_node.add_file(subtitle_file)

                # Continue only while the API reports a further page.
                page_token = page.get('nextPageToken')
                if not page_token:
                    break

            channel.add_child(topic)

        # Check for errors in channel construction
        raise_for_invalid_channel(channel)

        return channel
def convert_ka_node_to_ricecooker_node(ka_node):
    """Convert a Khan Academy node into the matching ricecooker node.

    Args:
        ka_node: a KhanTopic, KhanExercise, KhanVideo or KhanArticle.

    Returns:
        * ``nodes.TopicNode`` for a KhanTopic, with its convertible children
          attached recursively;
        * ``nodes.ExerciseNode`` for a KhanExercise, with one PerseusQuestion
          per assessment item;
        * ``nodes.VideoNode`` for a KhanVideo;
        * ``None`` when the node's slug is in SLUG_BLACKLIST, for a
          KhanArticle (not implemented yet), or for any other node type.

    Raises:
        Exception: if a video's license is not present in LICENSE_MAPPING.
    """
    if ka_node.slug in SLUG_BLACKLIST:
        return None

    if isinstance(ka_node, KhanTopic):
        topic = nodes.TopicNode(
            source_id=ka_node.id,
            title=ka_node.title,
            description=ka_node.description[:400],
        )
        for ka_subtopic in ka_node.children:
            subtopic = convert_ka_node_to_ricecooker_node(ka_subtopic)
            # Blacklisted / unsupported children convert to None; skip them.
            if subtopic:
                topic.add_child(subtopic)
        return topic

    elif isinstance(ka_node, KhanExercise):
        exercise = nodes.ExerciseNode(
            source_id=ka_node.id,
            title=ka_node.title,
            description=ka_node.description[:400],
            # exercise_data={'mastery_model': node.get('suggested_completion_criteria')},
            license=licenses.SpecialPermissionsLicense(
                copyright_holder="Khan Academy",
                description=
                "Permission granted to distribute through Kolibri for non-commercial use"
            ),  # need to formalize with KA
            # Fixed: previously read the undefined name `node`.
            thumbnail=ka_node.thumbnail,
        )
        for ka_assessment_item in ka_node.get_assessment_items():
            # Fixed: the fields must come from the Khan item being iterated,
            # not from `assessment_item`, which is unbound on the first pass.
            assessment_item = PerseusQuestion(
                id=ka_assessment_item.id,
                raw_data=ka_assessment_item.data,
                source_url=ka_assessment_item.source_url,
            )
            exercise.add_question(assessment_item)
        return exercise

    elif isinstance(ka_node, KhanVideo):

        # TODO: Use traditional compression here to avoid breaking existing KA downloads?
        # Prefer the "mp4-low" download URL and fall back to "mp4".
        video_files = [
            VideoFile(
                ka_node.download_urls.get("mp4-low",
                                          ka_node.download_urls.get("mp4")))
        ]

        # if the video is in English, include any subtitles available along with it
        if ka_node.lang == "en":
            for lang_code in ka_node.get_subtitle_languages():
                # Fixed: previously read the undefined name `node`.
                # NOTE(review): assumes ka_node.id is the YouTube id that
                # YouTubeSubtitleFile expects -- confirm.
                video_files.append(
                    YouTubeSubtitleFile(ka_node.id, language=lang_code))

        # convert KA's license format into our own license classes
        if ka_node.license in LICENSE_MAPPING:
            video_license = LICENSE_MAPPING[ka_node.license]
        else:
            # license = licenses.CC_BY_NC_SA # or?
            raise Exception("Unknown license on video {}: {}".format(
                ka_node.id, ka_node.license))

        video = nodes.VideoNode(
            source_id=ka_node.id,
            title=ka_node.title,
            description=ka_node.description[:400],
            license=video_license,
            # Fixed: previously read the undefined name `node`.
            thumbnail=ka_node.thumbnail,
            files=video_files,
        )

        return video

    elif isinstance(ka_node, KhanArticle):
        # TODO
        return None

    # Any other node type is not converted.
    return None
def _build_tree(node, sourcetree):
    #for child_source_node in sourcetree:
    #d=dict(child_source_node)
    #print(child_source_node)
    #title = child_source_node.replace(u'\xa0', u' ').replace('\n', '')
    #title="none"
    #title=""
    files = ""
    for s in sourcetree:
        print(type(s))
        if s.get('type') == 'file':
            title = str(s.get('name'))
            print("title:")
            print(title)
            files = s.get('files')

        else:
            # if child_source_node=='children':
            #for i in range(len(sourcetree.get('children'))):
            #   _build_tree(node,sourcetree.get('children')[i])
            #print(s)
            child_node = nodes.TopicNode(
                source_id=str(s.get('name')),
                title=str(s.get('name')).replace("_", " "),
            )
            node.add_child(child_node)

            source_tree_children = s.get("children", [])

            _build_tree(child_node, source_tree_children)

    #print("T:", title)
    #path="none"

    #source_id="none"

    #print("S:", source_id)

    #fancy_license = get_license(licenses.SPECIAL_PERMISSIONS, description='gfh', copyright_holder='sed')

    for child_source_node in sourcetree:

        try:
            main_file = child_source_node['files'][
                0] if 'files' in child_source_node else {}
            kind = guess_content_kind(
                path=main_file.get('path'),
                web_video_data=main_file.get('youtube_id')
                or main_file.get('web_url'))
        except UnknownContentKindError:
            continue
        print("kind:")
        print(kind)

        # if kind == content_kinds.TOPIC:
        #     child_node = nodes.TopicNode(
        #         source_id=str(uuid.uuid4()),
        #         title=str(child_source_node.get('name'))
        #     )
        #     node.add_child(child_node)

        # source_tree_children = child_source_node.get("children", [])

        # _build_tree(child_node, source_tree_children)

        if kind == content_kinds.VIDEO:
            child_node = nodes.VideoNode(
                # source_id=str(uuid.uuid4()),
                source_id=str(child_source_node.get('name')).replace(' ', '_'),
                title=str(child_source_node.get('name').replace(".mp4", "")),
                license='All Rights Reserved',
                copyright_holder="Sarva Shiksha Abhiyaan",
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        else:  # unknown content file format
            continue

    return node
def crawl_each_post(post_url):
    """Download the videos embedded in one post and register them by course.

    The post's heading is split on " OF " / " FROM " and the part after the
    first delimiter is used as the course name. Every embedded video iframe
    is downloaded with youtube_dl (automatic subtitles requested) and turned
    into a VideoNode that is appended to ``EPISODE_DICT[course]``. Posts
    without any video wrapper are only logged.
    """
    response = requests.get(post_url, headers=HEADERS)
    page = BeautifulSoup(response.content, "html.parser")
    wrapper = page.find('div', {'class': 'wpb_wrapper'})

    heading = wrapper.find('div', {'class': 'vc_custom_heading'})
    course_name = heading.getText().strip()

    # Course name is whatever follows the first " OF " or " FROM ".
    split_pattern = '|'.join(re.escape(token) for token in (" OF ", " FROM "))
    course = re.split(split_pattern, course_name)[1]

    video_wrappers = wrapper.find_all('div', {'class': 'wpb_video_wrapper'})
    if not video_wrappers:
        LOGGER.info(
            "Format of the file is not supported by the sushi chef : {}".
            format(course_name))
        return

    for video_wrapper in video_wrappers:
        iframe_src = video_wrapper.find('iframe').attrs["src"]
        video_url = iframe_src.split("?feature")[0]
        video_id = video_url.split("/")[-1]

        downloader = youtube_dl.YoutubeDL({
            'outtmpl': './downloads/%(id)s.%(ext)s',
            'writeautomaticsub': True,
            'logger': LOGGER
        })
        with downloader:
            result = downloader.extract_info(
                "http://www.youtube.com/watch?v={}".format(video_id),
                download=True)

        # A result carrying 'entries' is unwrapped to its first entry.
        video = result['entries'][0] if 'entries' in result else result

        video_title = video["title"]
        # NOTE(review): these paths presumably match the 'outtmpl' download
        # location above -- confirm DOWNLOAD_DIRECTORY points at ./downloads.
        video_path = "{}/{}.mp4".format(DOWNLOAD_DIRECTORY, video_id)
        subtitle_path = "{}/{}.en.vtt".format(DOWNLOAD_DIRECTORY, video_id)

        video_file = files.VideoFile(
            path=video_path,
            language=languages.getlang('en').code)
        subtitle_file = files.SubtitleFile(
            path=subtitle_path,
            language=languages.getlang('en').code)

        video_node = nodes.VideoNode(
            source_id=video_title.strip().replace(" ", "_"),
            title=video_title,
            files=[video_file, subtitle_file],
            license=CHANNEL_LICENSE,
            copyright_holder=COPYRIGHT_HOLDER,
        )

        # Group the nodes per course, creating the list on first use.
        EPISODE_DICT.setdefault(course, []).append(video_node)
        LOGGER.info("   Uploading video - {}".format(video_title.strip()))
Exemple #23
0
    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and build topic tree

        One TopicNode is created per playlist id in PLAYLIST_MAP and one
        VideoNode per distinct video of that playlist. Thumbnails whose image
        format is not JPEG/PNG are converted to a 400x225 JPEG stored under
        chefdata/thumbnails.

        Args:
          - args: arguments passed in on the command line
          - kwargs: extra options passed in as key="value" pairs on the command line
            For example, add the command line option   lang="fr"  and the value
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Returns: ChannelNode
        Raises: SystemExit(0) after writing the CSV when the DOWNLOAD_TO_CSV
            option is "true".
        """
        # editing metadata
        for key, value in kwargs.items():
            if key == NO_CACHE_KEYNAME:
                self.use_cache = False
                LOGGER.info("use_cache = '%d'", self.use_cache)
            if key == EXTRACT_VIDEO_INFO:
                self.insert_video_info = True
                self.video_list = value.split(",")
            if key == EXTRACT_VIDEO_PLAYLIST_INFO:
                self.insert_video_info = True
                self.to_playlist = value
            if key == DOWNLOAD_TO_CSV:
                if value == "true":
                    print('csv = true')
                    create_csv()
                    # `exit` is injected by the `site` module and is not
                    # guaranteed to exist; raise SystemExit directly.
                    raise SystemExit(0)

        channel = self.get_channel(
            *args,
            **kwargs)  # Create ChannelNode from data in self.channel_info
        # Get Channel Topics

        # Create thumbnails folder in chefdata if not exists
        thumbnails_dir = os.path.join('chefdata', 'thumbnails')
        os.makedirs(thumbnails_dir, exist_ok=True)

        # youtube_cache = os.path.join("chefdata", "youtubecache")

        for playlist_id in PLAYLIST_MAP:

            playlist = YouTubePlaylistUtils(id=playlist_id,
                                            cache_dir=YOUTUBE_CACHE_DIR)

            playlist_info = playlist.get_playlist_info(use_proxy=False)

            # Use the playlist description, falling back to its title when
            # the description is empty.
            playlist_description = (playlist_info["description"]
                                    or playlist_info["title"])

            topic_source_id = 'aimhi-child-topic-{0}'.format(
                playlist_info["title"])
            topic_node = nodes.TopicNode(title=playlist_info["title"],
                                         source_id=topic_source_id,
                                         author="AimHi",
                                         provider="AimHi",
                                         description=playlist_description,
                                         language="en")

            # Ids already added to this topic; a set keeps the duplicate
            # check O(1) per video.
            seen_video_ids = set()

            # insert videos into playlist topic after creation
            for child in playlist_info["children"]:
                # skip duplicate videos
                if child["id"] in seen_video_ids:
                    continue

                video = YouTubeVideoUtils(id=child["id"], cache_dir=False)
                video_details = video.get_video_info(use_proxy=False)
                video_source_id = "AimHi-{0}-{1}".format(
                    playlist_info["title"], video_details["id"])

                # Check youtube thumbnail extension as some are not supported formats
                print(video_details["thumbnail"])
                image_response = requests.get("{0}".format(
                    video_details["thumbnail"]))

                img = Image.open(BytesIO(image_response.content))
                if img.format not in ['JPG', 'PNG', 'JPEG']:
                    # if not in correct format, convert image and download to files folder
                    print(video_details["thumbnail"])
                    print("{0}'s thumbnail not supported ({1}).".format(
                        video_details["id"], img.format))
                    img_file_name = '{}_thumbnail.jpg'.format(
                        video_details["id"])
                    thumbnail_link = os.path.join(thumbnails_dir,
                                                  img_file_name)

                    jpg_img = img.convert("RGB")

                    # resize image to thumbnail dimensions; Image.ANTIALIAS
                    # was removed in Pillow 10, Image.LANCZOS is the same
                    # filter under its current name.
                    jpg_img = jpg_img.resize((400, 225), Image.LANCZOS)
                    jpg_img.save(thumbnail_link)
                else:
                    thumbnail_link = video_details["thumbnail"]

                print(thumbnail_link)
                video_node = nodes.VideoNode(
                    source_id=video_source_id,
                    title=video_details["title"],
                    description=video_details["description"],
                    author="AimHi",
                    language="en",
                    provider="AimHi",
                    thumbnail=thumbnail_link,
                    license=licenses.get_license("CC BY-NC-ND",
                                                 copyright_holder="AimHi"),
                    files=[
                        files.YouTubeVideoFile(
                            youtube_id=video_details["id"], language="en")
                    ])
                # add video to topic
                print(video_details["id"] + " has been added!")
                # remember the id so later duplicates are skipped
                seen_video_ids.add(video_details["id"])
                topic_node.add_child(video_node)

            # add topic to channel
            channel.add_child(topic_node)

        return channel
Exemple #24
0
    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and build topic tree
        Args:
          - args: arguments passed in during upload_channel (currently None)
          - kwargs: extra argumens and options not handled by `uploadchannel`.
            For example, add the command line option   lang="fr"  and the string
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Returns: ChannelNode

        The entries found at PLAYLISTS_URL are sorted by the first number in
        their title and added directly to the channel:
            Channel (ChannelNode)
            |   Youtube Video (VideoNode)
            |   Youtube Video (VideoNode)

        Any exception is printed to stdout with its traceback and re-raised.
        """
        channel = self.get_channel(
            *args,
            **kwargs)  # Create ChannelNode from data in self.channel_info

        # Download the playlist/video information
        try:
            with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
                info_dict = ydl.extract_info(PLAYLISTS_URL, download=False)
                print(info_dict.keys())

                # Per-playlist topics (with a language guessed from the
                # playlist title) are currently disabled:
                #for playlist in info_dict['entries']:
                #    playlist_topic = nodes.TopicNode(title=playlist['title'], source_id=playlist['id'], language=language)
                #    channel.add_child(playlist_topic)

                # Order the videos by the first number found in the title.
                # Raw string: "\d" in a normal literal is an invalid escape.
                videos = sorted(
                    info_dict['entries'],
                    key=lambda x: int(re.search(r"\d+", x['title']).group()))
                print([v['title'] for v in videos])
                # NOTE(review): this 15 second pause looks like a debugging
                # aid for reading the list printed above -- confirm whether
                # it can be removed.
                import time
                time.sleep(15)
                for video in videos:
                    # Use the first thumbnail when there is one; otherwise
                    # pass None (not 0) and let the thumbnail be derived.
                    thumbnails = video['thumbnails']
                    thumbnail_url = thumbnails[0]['url'] if thumbnails else None

                    channel.add_child(
                        nodes.VideoNode(
                            title=video['title'],
                            source_id=video['id'],
                            license=licenses.PublicDomainLicense(),
                            description=video['description'],
                            derive_thumbnail=not thumbnail_url,
                            files=[files.WebVideoFile(video['webpage_url'])],
                            thumbnail=thumbnail_url,
                            author=AUTHOR,
                            # tags = video['categories'] + video['tags'], # TODO: uncomment this when added
                        ))
        except Exception:
            # Show the traceback on stdout before propagating the error.
            import sys
            import traceback
            traceback.print_exc(file=sys.stdout)
            raise

        raise_for_invalid_channel(
            channel)  # Check for errors in channel construction

        return channel
Exemple #25
0
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively parse nodes in the list `sourcetree` and add them as children
    to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`.

    Args:
        parent_node: node that receives the children.
        sourcetree: list of dicts, each with a 'kind' key plus the metadata
            keys read for that kind ('source_id', 'title', 'license', ...).

    Returns:
        The `parent_node` that was passed in.

    Raises:
        NotImplementedError: if an entry's 'kind' is not a supported kind.
        KeyError: if an entry lacks a required key ('kind', 'title', and for
            non-topic nodes 'source_id' and 'license').
    """
    EXPECTED_NODE_TYPES = [TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE,
                           DOCUMENT_NODE, HTML5_NODE, SLIDESHOW_NODE]

    # Node kinds whose content is attached through `add_files`.
    # NOTE(review): assumes the *_NODE kind constants are hashable (e.g.
    # strings) -- confirm.
    file_node_classes = {
        VIDEO_NODE: nodes.VideoNode,
        AUDIO_NODE: nodes.AudioNode,
        DOCUMENT_NODE: nodes.DocumentNode,
        HTML5_NODE: nodes.HTML5AppNode,
        SLIDESHOW_NODE: nodes.SlideshowNode,
    }

    for source_node in sourcetree:
        kind = source_node['kind']
        if kind not in EXPECTED_NODE_TYPES:
            # Logger-side formatting keeps a non-string kind from raising
            # TypeError before the intended NotImplementedError.
            LOGGER.critical('Unexpected node kind found: %s', kind)
            raise NotImplementedError('Unexpected node kind found in json data.')

        if kind == TOPIC_NODE:
            child_node = nodes.TopicNode(
                source_id=source_node.get('source_id', None),
                title=source_node['title'],
                description=source_node.get('description'),
                author=source_node.get('author'),
                aggregator=source_node.get('aggregator'),
                provider=source_node.get('provider'),
                # no role for topics (computed dynamically from descendants)
                language=source_node.get('language'),
                thumbnail=source_node.get('thumbnail'),
                tags=source_node.get('tags'),
            )
            parent_node.add_child(child_node)
            build_tree_from_json(child_node, source_node.get('children', []))
            continue

        metadata = _json_node_metadata(source_node)

        if kind == EXERCISE_NODE:
            child_node = nodes.ExerciseNode(
                exercise_data=source_node.get('exercise_data'),
                questions=[],
                **metadata
            )
            add_questions(child_node, source_node.get('questions') or [])
        else:
            if kind == VIDEO_NODE:
                # video-specific option
                metadata['derive_thumbnail'] = source_node.get(
                    'derive_thumbnail', True)
            child_node = file_node_classes[kind](**metadata)
            add_files(child_node, source_node.get('files') or [])

        parent_node.add_child(child_node)

    return parent_node


def _json_node_metadata(source_node):
    """Return the constructor keyword arguments shared by all non-topic nodes."""
    return dict(
        source_id=source_node['source_id'],
        title=source_node['title'],
        description=source_node.get('description'),
        license=get_license(**source_node['license']),
        author=source_node.get('author'),
        aggregator=source_node.get('aggregator'),
        provider=source_node.get('provider'),
        role=source_node.get('role', roles.LEARNER),
        language=source_node.get('language'),
        thumbnail=source_node.get('thumbnail'),
        tags=source_node.get('tags'),
    )
Exemple #26
0
def download_content_node(category_node,
                          url,
                          title,
                          thumbnail=None,
                          description=None):
    """Scrape one resource page and attach it to ``category_node``.

    The page at ``url`` is fetched, its static assets are downloaded into a
    fresh temporary directory, site chrome is stripped, and the result is
    zipped and added to ``category_node`` as an HTML5 app node.  If the page
    contains an iframe under ``.embedded-video``, a separate video node for it
    is added to ``category_node`` first.

    Args:
        category_node: Node that receives the new children via ``add_child``.
        url: Address of the page to scrape; also used as the ``source_id`` of
            the HTML5 app node.
        title: Title for the HTML5 app node (passed through
            ``truncate_metadata``).
        thumbnail: Optional thumbnail location; when given it is downloaded
            into the temporary directory and the local copy is used.
        description: Optional description for the HTML5 app node.

    Returns:
        None.  The temporary directory is not removed by this function.
    """
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc,
                                 destination,
                                 'https://k12.thoughtfullearning.com',
                                 request_fn=make_request,
                                 url_blacklist=url_blacklist)

    # Page furniture that should not end up in the packaged app.  Order and
    # repetition match the original sequence of calls.
    unwanted_selectors = (
        '#header',
        '.subMenuBarContainer',
        '.breadbookmarkcontainer',
        '.resourcePageTypeTitle',
        '.sharethis-wrapper',
        '.ccBlock',
        # NOTE(review): this selector was listed twice in the original; kept
        # in case remove_node only drops the first match -- TODO confirm and
        # deduplicate if it is just a copy/paste slip.
        '#block-views-resource-info-block-block-1',
        '#block-views-resource-info-block-block-1',
        '#block-views-resource-info-block-block',
        '.productSuggestionContainer',
        'footer',
        # For minilessons
        '.field-name-field-minilesson-downloadables',
        # For writing assessments
        '.assessmentTGLink',
        '.assessmentModelRubrics',
        '.view-display-id-attachment_1',
    )
    for selector in unwanted_selectors:
        remove_node(doc, selector)

    # Write out the HTML source.  The encoding is pinned to UTF-8 so the
    # result does not depend on the platform's locale default, which can fail
    # on (or mis-encode) non-ASCII page content.
    index_path = os.path.join(destination, "index.html")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(str(doc))

    print("    ... downloaded to %s" % destination)
    # Debugging aid: call preview_in_browser(destination) here to inspect the
    # cleaned page locally.

    thumbnail_path = None
    if thumbnail:
        # Manually download the thumbnail and use it so we can lowercase the
        # extension to be accepted by Ricecooker.
        thumbnail_filename = derive_filename(thumbnail)
        thumbnail_path = os.path.join(destination, thumbnail_filename)
        download_file(thumbnail,
                      destination,
                      request_fn=make_request,
                      filename=thumbnail_filename)

    # If there is an embedded video in the page source grab it as a video node.
    iframe = doc.select_one('.embedded-video iframe')
    if iframe:
        youtube_url = iframe['src']
        youtube_id = get_youtube_id_from_url(youtube_url)
        # Metadata only; the video itself is not downloaded here.
        info = ydl.extract_info(youtube_url, download=False)
        video_title = info['title']
        print(
            "    ... and with video titled %s from www.youtube.com/watch?v=%s"
            % (video_title, youtube_id))
        video_node = nodes.VideoNode(
            source_id=youtube_id,
            title=truncate_metadata(video_title),
            license=licenses.CC_BY_NC_SALicense(
                copyright_holder=truncate_metadata('Thoughtful Learning')),
            description=info['description'],
            language="en",
            derive_thumbnail=True,
            files=[files.YouTubeVideoFile(youtube_id)],
        )
        category_node.add_child(video_node)

    # Package the cleaned page (and its downloaded assets) as an HTML5 app.
    zip_path = create_predictable_zip(destination)
    app_node = nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        description=description,
        thumbnail=thumbnail_path,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )

    category_node.add_child(app_node)