Beispiel #1
0
    def add_content_to_tree(self, channel):
        """
        Attach the 'English' part of ``self.channel_tree`` to ``channel``.

        The tree is walked as class -> subject -> items. Every class and
        every subject becomes a TopicNode; every entry of a subject's
        ``'items'`` list becomes a VideoNode holding a single VideoFile.

        Source ids: a class topic uses the bare class name, a subject topic
        uses ``English-<class>-<subject>``, and a video appends the item's
        ``id`` column to its subject id.
        """
        language_key = 'English'
        tree = self.channel_tree
        language = getlang("en")

        for class_name, subjects in tree[language_key].items():
            class_prefix = "{}-{}".format(language_key, class_name)
            class_topic = nodes.TopicNode(source_id=class_name, title=class_name)

            for subject_name, subject in subjects.items():
                subject_key = "{}-{}".format(class_prefix, subject_name)
                subject_topic = nodes.TopicNode(source_id=subject_key, title=subject_name)

                for row in subject['items']:
                    video_key = "{}-{}".format(subject_key, get_column(row, 'id'))
                    video_title = get_column(row, 'name')
                    video_description = get_column(row, 'description')
                    video_files = [files.VideoFile(path=get_column(row, 'file'))]
                    # FIXME: the license kind is still hardcoded to CC BY; it
                    # should come from the row's license column instead.
                    video_license = licenses.get_license(
                        le_licenses.CC_BY,
                        copyright_holder=get_column(row, "copyright"),
                    )
                    # No thumbnail is passed even though rows have a
                    # "thumbnail" column (it was disabled in the original).
                    subject_topic.add_child(
                        nodes.VideoNode(
                            source_id=video_key,
                            title=video_title,
                            description=video_description,
                            files=video_files,
                            language=language,
                            license=video_license,
                        )
                    )

                class_topic.add_child(subject_topic)

            channel.add_child(class_topic)
def add_files(node, file_list):
    """
    Attach one ricecooker file object to `node` per entry of `file_list`.

    Each entry is a dict. Its optional 'path' is resolved with `get_abspath`,
    then `guess_file_type` picks the file type from `node.kind` and the
    entry's path / 'youtube_id' / 'web_url' / 'encoding' values. The matching
    `files.*` object is added to the node. A YouTube entry additionally gets
    a YouTubeSubtitleFile with language 'en'.

    Raises:
        UnknownFileTypeError: if the guessed type matches none of the
            handled file types.
        KeyError: if a subtitle, base64, web-video or YouTube entry lacks the
            key that branch requires ('language', 'encoding', 'web_url',
            'youtube_id').
    """
    for f in file_list:

        path = f.get('path')
        # Per the original note, get_abspath expands content:// into
        # ./content/ in file paths; entries without a path keep None.
        abspath = get_abspath(path) if path is not None else None

        file_type = guess_file_type(node.kind, filepath=abspath, youtube_id=f.get('youtube_id'), web_url=f.get('web_url'), encoding=f.get('encoding'))

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(path=abspath, language=f.get('language'), ffmpeg_settings=f.get('ffmpeg_settings')))
        elif file_type == FileTypes.SUBTITLE_FILE:
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(web_url=f['web_url'], high_resolution=f.get('high_resolution')))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'], high_resolution=f.get('high_resolution')))
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'], language='en'))
        else:
            # Use the value fetched with .get(): indexing f['path'] here would
            # raise KeyError for path-less entries and hide the real error.
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(path))
Beispiel #3
0
def scrape_collection_files(topic, url):
    """
    Add the assets listed at `url` to `topic`.

    The JSON document at `url` is expected to carry its assets under 'data'.
    'mp4' assets are looked up through FILE_STORAGE_URL and added as
    VideoNodes (video file plus thumbnail). 'png' assets are collected and,
    if there are any, added together as one slideshow node. Any other
    extension is only logged as a warning.
    """
    slides = []

    for asset in json.loads(downloader.read(url))['data']:
        extension = asset['attributes']['extension']

        if extension == 'png':
            slide_url = asset['attributes']['thumbnail_url'].replace(
                'element.png', '*****@*****.**')
            slides.append({
                'url': slide_url,
                'caption': asset['attributes']['name'],
            })
            continue

        if extension == 'mp4':
            storage_url = FILE_STORAGE_URL.format(id=asset['id'])
            storage_info = json.loads(downloader.read(storage_url))
            # Only the first storage record is used.
            video = storage_info['data'][0]['attributes']
            video_node = nodes.VideoNode(
                source_id=video['url'],
                title=asset['attributes']['name'],
                license=LICENSE,
                files=[
                    files.VideoFile(video['url']),
                    files.ThumbnailFile(video['thumbnail_url']),
                ],
            )
            topic.add_child(video_node)
            continue

        LOGGER.warning('Unable to add {} from {}'.format(extension, url))

    # All collected images end up in a single slideshow node.
    if slides:
        topic.add_child(create_slideshow(slides, url, topic.title, 'English'))
Beispiel #4
0
def scrape_directory(topic, directory, indent=1):
    """
    Mirror the contents of `directory` under `topic`.

    Only the immediate children of `directory` are read here; deeper levels
    are reached by recursing into each sub-folder. Every sub-folder becomes a
    TopicNode (its name is printed, indented by nesting depth), every '.mp4'
    file becomes a VideoNode and every '.pdf' file is handed to
    `generate_pdf_nodes` via PDFParser. Other files are ignored.
    """
    # The first os.walk() entry describes `directory` itself; nothing is
    # yielded when the directory cannot be listed.
    first_level = next(os.walk(directory), None)
    if first_level is None:
        return
    root, subfolders, filenames = first_level

    for folder in subfolders:
        print('{}{}'.format('    ' * indent, folder))
        child_topic = nodes.TopicNode(source_id=folder, title=folder)
        topic.add_child(child_topic)
        scrape_directory(child_topic,
                         os.sep.join([root, folder]),
                         indent=indent + 1)

    for filename in filenames:
        stem, extension = os.path.splitext(filename)
        location = os.sep.join([root, filename])

        if extension == '.mp4':
            # The source id is the folder path and file name concatenated
            # directly, with no separator in between.
            video_node = nodes.VideoNode(source_id=root + filename,
                                         title=stem,
                                         license=LICENSE,
                                         copyright_holder=COPYRIGHT_HOLDER)
            video_node.add_file(files.VideoFile(location))
            topic.add_child(video_node)
        elif extension == '.pdf':
            with PDFParser(location) as parser:
                chapters = parser.get_data_file()
                generate_pdf_nodes(chapters,
                                   topic,
                                   source=os.path.basename(filename))
 def to_contentnode(self, title, directory=None, *args, **kwargs):
     """
     Build the content node that matches ``self.kind``.

     The file is first produced with ``self.to_file(directory=...)``. HTML5
     kinds yield an HTML5AppNode wrapping an HTMLZipFile, video kinds yield
     a VideoNode wrapping a VideoFile. Extra keyword arguments are passed on
     to the node constructor; extra positional arguments are ignored. Any
     other kind returns None.
     """
     local_path = self.to_file(directory=directory)

     if self.kind == content_kinds.HTML5:
         node_class, file_class = nodes.HTML5AppNode, files.HTMLZipFile
     elif self.kind == content_kinds.VIDEO:
         node_class, file_class = nodes.VideoNode, files.VideoFile
     else:
         return None

     return node_class(source_id=self.url,
                       title=title,
                       files=[file_class(local_path)],
                       **kwargs)
def scrape_iversity(channel):
    """
    Scrape the lesson-unit page of the iversity course and fill `channel`.

    Every 'chapter-units-wrapper' block becomes a TopicNode. Each lesson
    link in a chapter that carries a 'unit_video' icon is opened, and its
    480-res video source plus its subtitle track are added as a VideoNode.
    Lessons without the icon are only logged.
    """
    url = "{}/en/my/courses/rethinking-us-them-integration-and-diversity-in-europe/lesson_units".format(
        BASE_URL)
    LOGGER.info("   Scraping Migration Matters at {}".format(url))
    page = read_source(url)

    for chapter in page.find_all('div', {'class': 'chapter-units-wrapper'}):
        title = str(chapter.find('div', {'class': 'chapter-title'}).string)
        topic = nodes.TopicNode(source_id=title.strip().replace(" ", "_"),
                                title=title)

        for lesson in chapter.find_all('a', {'class': 'unit-wrapper'}):
            has_video = lesson.find('i', {'class': 'unit_video'})
            unit_title = lesson.find('span', {'class': 'unit-title'})
            lesson_title = str(unit_title.string).strip()

            if not has_video:
                LOGGER.info(
                    "Format of the file is not supported by the sushi chef : {}"
                    .format(lesson_title))
                continue

            lesson_url = "{}{}".format(BASE_URL, lesson.attrs["href"])
            player = read_source(lesson_url).find('video')

            subtitle_track = player.find('track', {'kind': 'subtitles'})
            subtitle_file = files.SubtitleFile(
                path=subtitle_track.attrs["src"],
                language=languages.getlang('en').code)

            video_source = player.find('source', {'res': '480'})
            video_file = files.VideoFile(
                path=video_source.attrs["src"],
                language=languages.getlang('en').code)

            lesson_node = nodes.VideoNode(
                source_id=lesson_title.replace(" ", "_"),
                title=lesson_title,
                files=[video_file, subtitle_file],
                license=CHANNEL_LICENSE,
                copyright_holder=COPYRIGHT_HOLDER)
            # lesson_title is already stripped, so it is logged as-is.
            LOGGER.info("   Uploading video - {}".format(lesson_title))
            topic.add_child(lesson_node)

        channel.add_child(topic)
def make_content_node(kind, source_id, title, license, filepath, optionals):
    """
    Build the ContentNode subclass for `kind` around a single file.

    Video, audio and document kinds are supported; any other kind returns
    None. `optionals` supplies the optional 'author' and 'description'
    values, plus 'thumbnail' for audio and document nodes. Video nodes ask
    for a derived thumbnail instead.
    """
    if kind == content_kinds.VIDEO:
        node_class, file_class = nodes.VideoNode, files.VideoFile
        derive_thumbnail = True
    elif kind == content_kinds.AUDIO:
        node_class, file_class = nodes.AudioNode, files.AudioFile
        derive_thumbnail = False
    elif kind == content_kinds.DOCUMENT:
        node_class, file_class = nodes.DocumentNode, files.DocumentFile
        derive_thumbnail = False
    else:
        return None

    node_kwargs = {
        "source_id": source_id,
        "title": title,
        "license": license,
        "author": optionals.get("author"),
        "description": optionals.get("description"),
    }
    if derive_thumbnail:
        node_kwargs["derive_thumbnail"] = True  # video-specific data
    else:
        node_kwargs["thumbnail"] = optionals.get("thumbnail")
    node_kwargs["files"] = [file_class(path=filepath)]

    return node_class(**node_kwargs)
Beispiel #8
0
def scrape_content(endpoint, channel, existingNode=None):
    """
    Recursively turn the listing at `endpoint` into topic and video nodes.

    Each 'text-xs-left' cell in the page's table body is classified by its
    'data-sort-value' attribute: values ending in '.mp4' become VideoNodes,
    values starting with 'dir' become TopicNodes whose listing is scraped
    recursively, and anything else is only logged.

    Args:
        endpoint: URL of the listing page to read.
        channel: node that receives children when no parent topic is given.
        existingNode: topic that receives the nodes found on this page; when
            None (the top-level call) they are attached to `channel`.
    """
    replacements = {" ": "%20", "#": "%23"}
    # Resolve the parent once so videos and folders are attached the same
    # way. The video branch used to call existingNode.add_child() directly
    # and crashed on the top-level call, where existingNode is None.
    parent = existingNode if existingNode is not None else channel

    content = read_source(endpoint)
    attributes = content.find("tbody").find_all("td", "text-xs-left")

    for attribute in attributes:
        source_id = attribute.attrs["data-sort-value"]

        # Check if it is mp4 file
        if source_id.endswith(".mp4"):
            video_info = attribute.find("a")
            video_title, _ext = splitext(str(video_info.string))
            # Drop the href's first character (presumably a leading slash,
            # since BASE_URL is prepended) and escape spaces.
            filter_video_link = video_info.attrs["href"][1:].replace(
                " ", "%20")
            video_link = BASE_URL + filter_video_link
            video_file = files.VideoFile(path=video_link)
            video_node = nodes.VideoNode(source_id=source_id,
                                         title=video_title,
                                         files=[video_file],
                                         license=CHANNEL_LICENSE)
            parent.add_child(video_node)

        # Check if it is a directory
        elif source_id.startswith("dir"):
            title = str(attribute.find("strong").string)
            topic_node = nodes.TopicNode(source_id=source_id, title=title)
            parent.add_child(topic_node)

            # The folder title, URL-escaped, is the next path segment.
            new_end_point = replace_all(title, replacements)
            new_end = endpoint + "{}/".format(new_end_point)
            scrape_content(new_end, channel, topic_node)
        else:
            LOGGER.info(
                "Format of the file is not supported by the sushi chef : {}".
                format(source_id))
Beispiel #9
0
    def video_node_from_dropbox(self, video_details, link, token):
        """
        Build a VideoNode for the video behind a Dropbox shared link.

        The shared file is requested through the Dropbox client and written
        to VIDEO_FOLDER under its Dropbox file name, unless a file with that
        name is already there. The node takes its title from
        ``video_details["title"]`` and uses `link` as its source id.
        """
        client = dropbox.Dropbox(token)
        metadata, response = client.sharing_get_shared_link_file(url=link)

        # Path of the local copy, relative to the current working directory.
        local_path = os.path.relpath(os.path.join(VIDEO_FOLDER, metadata.name))

        if os.path.isfile(local_path):
            LOGGER.info("{} already downloaded. Skipping".format(
                metadata.name))
        else:
            with open(local_path, 'wb') as target:
                target.write(response.content)

        local_video = files.VideoFile(path=local_path)

        return nodes.VideoNode(
            title=video_details["title"],
            source_id=link,
            license=licenses.CC_BYLicense("TicTacLearn"),
            files=[local_video])
Beispiel #10
0
def add_files(node, file_list):
    """
    Attach every entry of `file_list` to `node` according to its extension.

    Each entry is passed straight through as the file's `path`, so entries
    are expected to be plain path values rather than dicts. The type string
    comes from `parse_file_name`: 'mp3' adds an AudioFile, 'pdf' a
    DocumentFile and 'mp4' a VideoFile. No other file types are handled.

    Raises:
        UnknownFileTypeError: for any other type returned by
            `parse_file_name`.
    """
    for f in file_list:
        _, file_type = parse_file_name(f)
        print(f)
        if file_type == 'mp3':
            node.add_file(files.AudioFile(path=f))
        elif file_type == 'pdf':
            node.add_file(files.DocumentFile(path=f))
        elif file_type == 'mp4':
            node.add_file(files.VideoFile(path=f))
        else:
            # Report the entry itself: it is a path, not a dict, so the old
            # f['path'] lookup raised TypeError instead of this error.
            raise UnknownFileTypeError("Unrecognised file type '{0}'".format(f))
def add_files(node, file_list):
    """
    Attach the thumbnail and video entries of `file_list` to `node`.

    Each entry is a dict. Its optional 'path' is resolved with
    `get_abspath`, and `guess_file_type` decides from `node.kind` and that
    path whether it becomes a ThumbnailFile or a VideoFile. The node kind is
    printed for every entry.

    Raises:
        UnknownFileTypeError: if the guessed type is neither a thumbnail nor
            a video file.
    """
    for f in file_list:
        path = f.get('path')
        # Per the original note, get_abspath expands content:// into
        # ./content/ in file paths; entries without a path keep None.
        abspath = get_abspath(path) if path is not None else None

        print("kind:" + node.kind.upper())

        file_type = guess_file_type(node.kind, filepath=abspath)

        if file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))

        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(
                files.VideoFile(path=abspath, language=f.get('language')))

        else:
            # Use the value fetched with .get(): indexing f['path'] here would
            # raise KeyError for path-less entries and hide the real error.
            raise UnknownFileTypeError(
                "Unrecognized file type '{0}'".format(path))
Beispiel #12
0
def add_files(node, file_list):
    """
    Create a file object for every entry of `file_list` and add it to `node`.

    Each entry is a dict whose 'file_type' must be one of the known type
    constants; anything else is logged as critical and raises
    NotImplementedError. Depending on the type and on which keys are present
    ('youtube_id', 'web_url', 'encoding', 'path'), the matching `files.*`
    class is instantiated and attached to the node.
    """
    known_types = [
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE
    ]

    for entry in file_list:
        kind = entry.get('file_type')
        if kind not in known_types:
            LOGGER.critical(kind)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        # May be a URL, a local path, or None.
        location = entry.get('path')

        if kind == VIDEO_FILE:
            # Videos come from YouTube, from another web page, or from a path.
            if 'youtube_id' in entry:
                new_file = files.YouTubeVideoFile(
                    youtube_id=entry['youtube_id'],
                    download_settings=entry.get('download_settings'),
                    high_resolution=entry.get('high_resolution', True),
                    maxheight=entry.get('maxheight'),
                    language=entry.get('language'),
                )
            elif 'web_url' in entry:
                new_file = files.WebVideoFile(
                    web_url=entry['web_url'],
                    download_settings=entry.get('download_settings'),
                    high_resolution=entry.get('high_resolution', True),
                    maxheight=entry.get('maxheight'),
                    language=entry.get('language'),
                )
            else:
                new_file = files.VideoFile(
                    path=entry['path'],
                    language=entry.get('language'),
                    ffmpeg_settings=entry.get('ffmpeg_settings'),
                )

        elif kind == AUDIO_FILE:
            new_file = files.AudioFile(path=entry['path'],
                                       language=entry.get('language'))

        elif kind == DOCUMENT_FILE:
            new_file = files.DocumentFile(path=location,
                                          language=entry.get('language'))

        elif kind == HTML5_FILE:
            new_file = files.HTMLZipFile(path=location,
                                         language=entry.get('language'))

        elif kind == THUMBNAIL_FILE:
            # An inline base64 encoding takes precedence over a path.
            if 'encoding' in entry:
                new_file = files.Base64ImageFile(encoding=entry['encoding'])
            else:
                new_file = files.ThumbnailFile(
                    path=location,
                    language=entry.get('language'),
                )

        elif kind == SUBTITLES_FILE:
            if 'youtube_id' in entry:
                new_file = files.YouTubeSubtitleFile(
                    youtube_id=entry['youtube_id'],
                    language=entry['language'])
            else:
                new_file = files.SubtitleFile(path=location,
                                              language=entry['language'])

        else:
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(
                entry['path']))

        node.add_file(new_file)
def crawl_each_post(post_url):
    """
    Download the YouTube videos embedded in a post and file them by course.

    The course name is the part of the post heading that follows " OF " or
    " FROM ". Each embedded video is downloaded with youtube-dl and wrapped,
    with its English subtitle file, in a VideoNode that is appended to
    ``EPISODE_DICT[course]``. Posts without any video wrapper are only
    logged.
    """
    response = requests.get(post_url, headers=HEADERS)
    page = BeautifulSoup(response.content, "html.parser")
    wrapper = page.find('div', {'class': 'wpb_wrapper'})
    heading = wrapper.find('div', {'class': 'vc_custom_heading'})
    course_name = heading.getText().strip()

    split_pattern = '|'.join(map(re.escape, (" OF ", " FROM ")))
    course = re.split(split_pattern, course_name)[1]

    video_wrappers = wrapper.find_all('div', {'class': 'wpb_video_wrapper'})
    if not video_wrappers:
        LOGGER.info(
            "Format of the file is not supported by the sushi chef : {}".
            format(course_name))
        return

    for video_wrapper in video_wrappers:
        embed_url = video_wrapper.find('iframe').attrs["src"].split(
            "?feature")[0]
        # The video id is the last path segment of the embed URL.
        video_id = embed_url.split("/")[-1]

        ydl_client = youtube_dl.YoutubeDL({
            'outtmpl': './downloads/%(id)s.%(ext)s',
            'writeautomaticsub': True,
            'logger': LOGGER
        })
        with ydl_client:
            info = ydl_client.extract_info(
                "http://www.youtube.com/watch?v={}".format(video_id),
                download=True)
        # When the result carries 'entries', only the first one is used.
        video = info['entries'][0] if 'entries' in info else info

        video_title = video["title"]
        source_id = video_title.strip().replace(" ", "_")
        video_path = "{}/{}.mp4".format(DOWNLOAD_DIRECTORY, video_id)
        subtitle_path = "{}/{}.en.vtt".format(DOWNLOAD_DIRECTORY, video_id)

        video_file = files.VideoFile(path=video_path,
                                     language=languages.getlang('en').code)
        subtitle_file = files.SubtitleFile(
            path=subtitle_path,
            language=languages.getlang('en').code)
        video_node = nodes.VideoNode(
            source_id=source_id,
            title=video_title,
            files=[video_file, subtitle_file],
            license=CHANNEL_LICENSE,
            copyright_holder=COPYRIGHT_HOLDER,
        )

        EPISODE_DICT.setdefault(course, []).append(video_node)
        LOGGER.info("   Uploading video - {}".format(video_title.strip()))