Ejemplo n.º 1
0
def add_files(node, file_list):
    """Create a file object for every entry of ``file_list`` and add it to ``node``.

    Each entry is a dict. Its ``path`` (if any) is resolved with
    ``get_abspath`` and the entry is classified by ``guess_file_type`` using
    the node kind together with the entry's ``path``, ``youtube_id``,
    ``web_url`` and ``encoding`` values. The matching ``files.*`` object is
    then attached through ``node.add_file``.

    Args:
        node: Object exposing ``kind`` and ``add_file``.
        file_list: Iterable of dicts describing the files to attach.

    Raises:
        UnknownFileTypeError: If the guessed type is not handled here.
        KeyError: If a subtitle entry has no ``language`` key.
    """
    for f in file_list:
        path = f.get('path')
        # Expand  content://  -->  ./content/  in file paths; entries that
        # carry no path (e.g. YouTube or base64 ones) keep abspath = None.
        abspath = get_abspath(path) if path is not None else None

        file_type = guess_file_type(
            node.kind,
            filepath=abspath,
            youtube_id=f.get('youtube_id'),
            web_url=f.get('web_url'),
            encoding=f.get('encoding'),
        )

        if file_type == FileTypes.AUDIO_FILE:
            node.add_file(files.AudioFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.THUMBNAIL:
            node.add_file(files.ThumbnailFile(path=abspath))
        elif file_type == FileTypes.DOCUMENT_FILE:
            node.add_file(files.DocumentFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.HTML_ZIP_FILE:
            node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language')))
        elif file_type == FileTypes.VIDEO_FILE:
            node.add_file(files.VideoFile(path=abspath, language=f.get('language'), ffmpeg_settings=f.get('ffmpeg_settings')))
        elif file_type == FileTypes.SUBTITLE_FILE:
            # The language is looked up strictly: a missing key raises KeyError.
            node.add_file(files.SubtitleFile(path=abspath, language=f['language']))
        elif file_type == FileTypes.BASE64_FILE:
            node.add_file(files.Base64ImageFile(encoding=f['encoding']))
        elif file_type == FileTypes.WEB_VIDEO_FILE:
            node.add_file(files.WebVideoFile(web_url=f['web_url'], high_resolution=f.get('high_resolution')))
        elif file_type == FileTypes.YOUTUBE_VIDEO_FILE:
            node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'], high_resolution=f.get('high_resolution')))
            # An English subtitle file is always requested alongside the video.
            node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'], language='en'))
        else:
            # Reuse the value fetched with .get(): indexing f['path'] here would
            # raise KeyError for path-less entries and hide the real error.
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(path))
def scrape_iversity(channel):
    """Scrape the iversity lesson-unit page and add one topic per chapter to ``channel``.

    Every ``chapter-units-wrapper`` div becomes a ``TopicNode``. Each
    ``unit-wrapper`` link inside it that carries a ``unit_video`` icon is
    fetched, and a ``VideoNode`` is built from the page's 480 ``<source>``
    plus, when present, its subtitles ``<track>``. Lessons without the video
    icon are logged as unsupported. Lessons whose page has no ``<video>``
    element or no 480 source are logged and skipped instead of aborting the
    whole scrape.

    Args:
        channel: Node exposing ``add_child``; receives the chapter topics.
    """
    url = "{}/en/my/courses/rethinking-us-them-integration-and-diversity-in-europe/lesson_units".format(
        BASE_URL)
    LOGGER.info("   Scraping Migration Matters at {}".format(url))
    source = read_source(url)
    chapters = source.find_all('div', {'class': 'chapter-units-wrapper'})
    # Every file uses the same language code, so look it up once.
    english = languages.getlang('en').code

    for chapter in chapters:
        title = str(chapter.find('div', {'class': 'chapter-title'}).string)
        source_id = title.strip().replace(" ", "_")
        topic = nodes.TopicNode(source_id=source_id, title=title)

        for lesson in chapter.find_all('a', {'class': 'unit-wrapper'}):
            video_title = str(
                lesson.find('span', {'class': 'unit-title'}).string).strip()

            if not lesson.find('i', {'class': 'unit_video'}):
                LOGGER.info(
                    "Format of the file is not supported by the sushi chef : {}"
                    .format(video_title))
                continue

            video_url = "{}{}".format(BASE_URL, lesson.attrs["href"])
            video_info = read_source(video_url).find('video')
            # find() returns None when nothing matches; guard before
            # dereferencing so one odd lesson page cannot stop the run.
            video_source = None
            if video_info is not None:
                video_source = video_info.find('source', {'res': '480'})
            if video_source is None:
                LOGGER.warning(
                    "   Skipping video without a 480 source - {}".format(
                        video_title))
                continue

            video_files = [
                files.VideoFile(path=video_source.attrs["src"],
                                language=english)
            ]
            subtitle_track = video_info.find('track', {'kind': 'subtitles'})
            if subtitle_track is not None:
                video_files.append(
                    files.SubtitleFile(path=subtitle_track.attrs["src"],
                                       language=english))
            else:
                LOGGER.warning(
                    "   No subtitles track found for video - {}".format(
                        video_title))

            video_node = nodes.VideoNode(
                source_id=video_title.replace(" ", "_"),
                title=video_title,
                files=video_files,
                license=CHANNEL_LICENSE,
                copyright_holder=COPYRIGHT_HOLDER)
            LOGGER.info("   Uploading video - {}".format(video_title))
            topic.add_child(video_node)

        channel.add_child(topic)
Ejemplo n.º 3
0
def add_files(node, file_list):
    """Build a ``files.*`` object for each entry of ``file_list`` and add it to ``node``.

    Entries are dicts whose ``file_type`` must be one of the expected
    constants; any other value is logged at critical level and rejected with
    ``NotImplementedError``. Video entries are dispatched on the presence of
    ``youtube_id`` / ``web_url``, thumbnails on ``encoding`` and subtitles on
    ``youtube_id``. Video and audio files, and all subtitle languages, are
    looked up strictly, so a missing ``path`` / ``language`` key raises
    ``KeyError``.
    """
    supported_types = [
        VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, HTML5_FILE, THUMBNAIL_FILE,
        SUBTITLES_FILE
    ]

    for entry in file_list:
        kind = entry.get('file_type')
        if kind not in supported_types:
            LOGGER.critical(kind)
            raise NotImplementedError(
                'Unexpected File type found in channel json.')

        # The location may be a URL, a local path, or missing altogether.
        location = entry.get('path')
        language = entry.get('language', None)

        if kind == VIDEO_FILE:
            # Three flavours of video: YouTube, generic web video, plain file.
            if 'youtube_id' in entry or 'web_url' in entry:
                stream_options = dict(
                    download_settings=entry.get('download_settings', None),
                    high_resolution=entry.get('high_resolution', True),
                    maxheight=entry.get('maxheight', None),
                    language=language,
                )
                if 'youtube_id' in entry:
                    new_file = files.YouTubeVideoFile(
                        youtube_id=entry['youtube_id'], **stream_options)
                else:
                    new_file = files.WebVideoFile(
                        web_url=entry['web_url'], **stream_options)
            else:
                new_file = files.VideoFile(
                    path=entry['path'],
                    language=language,
                    ffmpeg_settings=entry.get('ffmpeg_settings'),
                )

        elif kind == AUDIO_FILE:
            new_file = files.AudioFile(path=entry['path'], language=language)

        elif kind == DOCUMENT_FILE:
            new_file = files.DocumentFile(path=location, language=language)

        elif kind == HTML5_FILE:
            new_file = files.HTMLZipFile(path=location, language=language)

        elif kind == THUMBNAIL_FILE:
            # Inline base64 data wins over a path when both are given.
            if 'encoding' in entry:
                new_file = files.Base64ImageFile(encoding=entry['encoding'])
            else:
                new_file = files.ThumbnailFile(path=location,
                                               language=language)

        elif kind == SUBTITLES_FILE:
            if 'youtube_id' in entry:
                new_file = files.YouTubeSubtitleFile(
                    youtube_id=entry['youtube_id'],
                    language=entry['language'])
            else:
                new_file = files.SubtitleFile(path=location,
                                              language=entry['language'])

        else:
            raise UnknownFileTypeError("Unrecognized file type '{0}'".format(
                entry['path']))

        node.add_file(new_file)
def crawl_each_post(post_url):
    """Download the videos embedded in a post and file them under their course.

    The post's ``vc_custom_heading`` text is split on " OF " / " FROM " and
    the second piece is used as the course key. Each ``wpb_video_wrapper``
    iframe is downloaded with youtube-dl (automatic subtitles requested), and
    a ``VideoNode`` with the video and its English ``.vtt`` subtitles is
    appended to ``EPISODE_DICT[course]``. Posts without any video wrapper are
    only logged.

    Args:
        post_url: URL of the post page to crawl.
    """
    resp = requests.get(post_url, headers=HEADERS)
    soup = BeautifulSoup(resp.content, "html.parser")
    wrapper = soup.find('div', {'class': 'wpb_wrapper'})
    course_name = wrapper.find('div', {
        'class': 'vc_custom_heading'
    }).getText().strip()
    delimiters = " OF ", " FROM "
    regex_pattern = '|'.join(map(re.escape, delimiters))
    heading_parts = re.split(regex_pattern, course_name)
    if len(heading_parts) > 1:
        course = heading_parts[1]
    else:
        # A heading without either delimiter used to raise IndexError; fall
        # back to the whole heading so the post is still processed.
        LOGGER.warning(
            "   No course delimiter found in heading, using it as is : {}".
            format(course_name))
        course = course_name

    wpb_video_wrapper = wrapper.find_all('div', {'class': 'wpb_video_wrapper'})
    if not wpb_video_wrapper:
        LOGGER.info(
            "Format of the file is not supported by the sushi chef : {}".
            format(course_name))
        return

    english = languages.getlang('en').code
    ydl_options = {
        # Download into the same directory the file paths below are built
        # from, instead of a separately hard-coded "./downloads".
        'outtmpl': "{}/%(id)s.%(ext)s".format(DOWNLOAD_DIRECTORY),
        'writeautomaticsub': True,
        'logger': LOGGER
    }

    for each_wrapper in wpb_video_wrapper:
        video_url = each_wrapper.find('iframe').attrs["src"].split(
            "?feature")[0]
        video_id = video_url.split("/")[-1]

        with youtube_dl.YoutubeDL(ydl_options) as ydl:
            result = ydl.extract_info(
                "http://www.youtube.com/watch?v={}".format(video_id),
                download=True)
        # When the result has an 'entries' list, use its first item.
        video = result['entries'][0] if 'entries' in result else result

        video_title = video["title"]
        video_source_id = video_title.strip().replace(" ", "_")
        # NOTE(review): assumes youtube-dl produced an .mp4 and an .en.vtt
        # for this id - confirm against the configured download format.
        video_path = "{}/{}.mp4".format(DOWNLOAD_DIRECTORY, video_id)
        video_subtitle_path = "{}/{}.en.vtt".format(
            DOWNLOAD_DIRECTORY, video_id)
        video_file = files.VideoFile(path=video_path, language=english)
        video_subtitle = files.SubtitleFile(path=video_subtitle_path,
                                            language=english)
        video_node = nodes.VideoNode(
            source_id=video_source_id,
            title=video_title,
            files=[video_file, video_subtitle],
            license=CHANNEL_LICENSE,
            copyright_holder=COPYRIGHT_HOLDER,
        )

        EPISODE_DICT.setdefault(course, []).append(video_node)
        LOGGER.info("   Uploading video - {}".format(video_title.strip()))