Exemple #1
0
def test_set_language():
    """Check the `language` attribute of `SubtitleFile` for two kinds of input.

    One instance is built from the code string 'en', the other from the
    object returned by `languages.getlang('es')`. In both cases `language`
    is expected to come back as the plain code string. An unrecognized
    code is expected to raise TypeError.
    """
    from_code = SubtitleFile('path', language='en')
    from_object = SubtitleFile('path', language=languages.getlang('es'))

    # NOTE(review): the messages mention a Language class, but the checks
    # themselves assert that `language` is a `str`.
    assert isinstance(from_code.language, str), \
        "Subtitles must be converted to Language class"
    assert isinstance(from_object.language, str), \
        "Subtitles can be passed as Langauge models"

    assert from_code.language == 'en', "Subtitles must have a language"
    assert from_object.language == 'es', "Subtitles must have a language"

    with pytest.raises(TypeError):
        SubtitleFile('path', language='notalanguage')
Exemple #2
0
def test_convertible_substitles_ar_ttml(youtube_test_file):
    """
    Regression test: process a local Arabic .ttml subtitles file and check
    that the filename returned by `process_file` ends in `.vtt`.

    `youtube_test_file` is not used in the body; presumably it is requested
    so the .ttml file is present on disk -- TODO confirm against the fixture.
    """
    ttml_path = os.path.join(
        "tests", "testcontent", "downloaded", "testsubtitles_ar.ttml")
    assert os.path.exists(ttml_path)

    converted_name = SubtitleFile(ttml_path, language='ar').process_file()

    assert converted_name, 'conferted filename must exit'
    has_vtt_ext = converted_name.endswith('.vtt')
    assert has_vtt_ext, 'conferted filename must have .vtt extension'
def test_multiple_subs_can_be_added(video_file):
    """
    Baseline check: two subtitle files with different language codes ('en'
    and 'ar') are both still attached to the video node after `validate()`.
    """
    srt_path = "tests/testcontent/testsubtitles_ar.srt"
    assert os.path.exists(srt_path)

    node = VideoNode('vid-src-id', "Video", licenses.PUBLIC_DOMAIN)
    node.add_file(video_file)
    # The same .srt file is reused for both languages; only the code differs.
    for lang_code in ('en', 'ar'):
        node.add_file(SubtitleFile(srt_path, language=lang_code))

    node.validate()

    subtitle_count = sum(1 for f in node.files if isinstance(f, SubtitleFile))
    assert subtitle_count == 2, 'Missing subtitles files!'
def test_duplicate_language_codes_fixed_by_validate(video_file):
    """
    Video nodes should have at most one subtitle file for a particular lang code.

    Two 'ar' subtitle files are added to the same node; after `validate()`
    exactly one `SubtitleFile` is expected to remain in `node.files`.
    """
    srt_path = "tests/testcontent/testsubtitles_ar.srt"
    assert os.path.exists(srt_path)

    node = VideoNode('vid-src-id', "Video", licenses.PUBLIC_DOMAIN)
    node.add_file(video_file)

    first_ar = SubtitleFile(srt_path, language='ar')
    node.add_file(first_ar)
    # Second file deliberately repeats the language code of the first.
    duplicate_ar = SubtitleFile(srt_path, language='ar')
    node.add_file(duplicate_ar)

    node.validate()

    remaining = list(filter(lambda f: isinstance(f, SubtitleFile), node.files))
    assert len(remaining) == 1, 'Duplicate subtitles files not removed!'
Exemple #5
0
    def construct_channel(self, *args, **kwargs):
        """Build and return the channel tree for this chef.

        The channel obtained from `self.get_channel(**kwargs)` receives one
        topic node. That topic gets the page returned by
        `download_wikipedia_page` and a video node carrying the video file
        plus one SRT subtitle file per language ('en', 'es').
        Positional `args` are accepted but not used.
        """
        category_path = '/wiki/Category:Articles_containing_video_clips'
        thumbnail_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ee/A_Is_for_Atom_1953.webm/220px--A_Is_for_Atom_1953.webm.jpg'
        video_url = 'https://upload.wikimedia.org/wikipedia/commons/e/ee/A_Is_for_Atom_1953.webm'
        # `{}` is filled with the language code of each subtitle track.
        subtitle_url = 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3AA_Is_for_Atom_1953.webm&lang={}&trackformat=srt'

        channel = self.get_channel(**kwargs)

        topic = TopicNode(
            source_id=category_path,
            title="Articles containing video clips",
        )
        channel.add_child(topic)

        topic.add_child(
            download_wikipedia_page(category_path, thumbnail_url, 'A Is for Atom')
        )

        video_node = VideoNode(
            title='A Is for Atom 1953',
            source_id='A_Is_for_Atom_1953.webm',
            files=[VideoFile(path=video_url)],
            license=licenses.PublicDomainLicense(),
        )
        for lang in ('en', 'es'):
            # The URL has no file extension, so the format is passed explicitly.
            video_node.add_file(
                SubtitleFile(
                    path=subtitle_url.format(lang),
                    language=lang,
                    subtitlesformat='srt',
                )
            )
        topic.add_child(video_node)

        return channel
Exemple #6
0
def test_convertible_substitles_ar_srt():
    """
    Basic check that srt --> vtt conversion works.

    Processes a local Arabic .srt file, checks that the returned filename
    ends in `.vtt`, then reads the converted file from storage and looks
    for a known Arabic phrase in its contents.
    """
    local_path = os.path.join("tests", "testcontent", "samples",
                              "testsubtitles_ar.srt")
    assert os.path.exists(local_path)
    subtitle_file = SubtitleFile(local_path, language='ar')
    filename = subtitle_file.process_file()
    assert filename, 'converted filename must exist'
    assert filename.endswith(
        '.vtt'), 'converted filename must have .vtt extension'
    storage_path = config.get_storage_path(filename)
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to
    # the locale's preferred encoding, which can mis-decode (or fail on) the
    # Arabic text on systems whose default is not UTF-8.
    with open(storage_path, encoding='utf-8') as converted_vtt:
        filecontents = converted_vtt.read()
    check_words = 'لناس على'
    assert check_words in filecontents, 'missing check word in converted subs'
Exemple #7
0
def subtitle_file():
    """Return an English `SubtitleFile` backed by a small generated .vtt file.

    The file `tests/testcontent/generated/testsubtitles.vtt` is written only
    when it does not exist yet; an existing file is reused as-is.
    """
    local_path = os.path.join("tests", "testcontent", "generated",
                              "testsubtitles.vtt")
    if not os.path.exists(local_path):
        # Make sure the target directory exists first, otherwise open()
        # raises FileNotFoundError on a clean checkout.
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        with open(local_path, 'wb') as subtitlefile:
            subtitlefile.write(b'WEBVTT\n')
            subtitlefile.write(b'\n')
            subtitlefile.write(b'00:01.000 --> 00:04.250\n')
            subtitlefile.write(b'Testing subtitles\n')
    return SubtitleFile(local_path, language='en')
Exemple #8
0
def test_convertible_substitles_noext_subtitlesformat():
    """
    Check that subtitles are handled correctly when the path has no extension.

    A copy of the Arabic .ttml test file is made under a name without the
    `.ttml` suffix, and the format is supplied through `subtitlesformat`.
    """
    ttml_path = os.path.join(
        "tests", "testcontent", "downloaded", "testsubtitles_ar.ttml")
    assert os.path.exists(ttml_path)

    extensionless_path = ttml_path.replace('.ttml', '')
    copyfile(ttml_path, extensionless_path)
    assert os.path.exists(extensionless_path)

    # The format must be given explicitly because the path has no extension.
    subs = SubtitleFile(extensionless_path, language='ar', subtitlesformat='ttml')
    converted_name = subs.process_file()

    assert converted_name, 'conferted filename must exit'
    has_vtt_ext = converted_name.endswith('.vtt')
    assert has_vtt_ext, 'conferted filename must have .vtt extension'
Exemple #9
0
def test_convertible_substitles_from_pressurcooker(pressurcooker_test_files):
    """
    Try to load all the test files used in pressurecooker as ricecooker `SubtitleFile`s.
    All subs have the appropriate extension so no need to specify `subtitlesformat`.

    Each entry of `pressurcooker_test_files` is read for the keys
    `localpath`, `language` and `check_words`.
    """
    for fixture in pressurcooker_test_files:
        localpath = fixture['localpath']
        assert os.path.exists(
            localpath), 'Error mising local test file ' + localpath
        subtitle_file = SubtitleFile(localpath, language=fixture['language'])
        filename = subtitle_file.process_file()
        assert filename, 'conferted filename must exit'
        assert filename.endswith(
            '.vtt'), 'conferted filename must have .vtt extension'
        storage_path = config.get_storage_path(filename)
        # Decode explicitly as UTF-8 so the check does not depend on the
        # locale's default encoding when `check_words` is non-ASCII.
        with open(storage_path, encoding='utf-8') as converted_vtt:
            filecontents = converted_vtt.read()
        assert fixture[
            'check_words'] in filecontents, 'missing check_words in converted subs'
Exemple #10
0
def test_convertible_substitles_weirdext_subtitlesformat():
    """
    Check that we handle cases when ext cannot be guessed from URL or localpath.
    Passing `subtitlesformat` allows chef authors to manually specify subs format.

    The subtitles come from a remote URL, so this test needs network access.
    """
    subs_url = 'https://commons.wikimedia.org/w/api.php?' \
        + 'action=timedtext&title=File%3AA_Is_for_Atom_1953.webm&lang=es&trackformat=srt'
    subtitle_file = SubtitleFile(
        subs_url,
        language='es',
        subtitlesformat='srt'  # set explicitly: ext can't be inferred from the URL
    )
    filename = subtitle_file.process_file()
    assert filename, 'conferted filename must exit'
    assert filename.endswith(
        '.vtt'), 'conferted filename must have .vtt extension'
    storage_path = config.get_storage_path(filename)
    # Decode explicitly as UTF-8: the expected phrase contains 'á', which a
    # non-UTF-8 locale default would mis-decode.
    with open(storage_path, encoding='utf-8') as converted_vtt:
        filecontents = converted_vtt.read()
    assert 'El total de los protones y neutrones de un átomo' in filecontents, \
        'missing check words in converted subs'
Exemple #11
0
def test_bad_subtitles_raises(bad_subtitles_file):
    """Check that processing the file at `bad_subtitles_file.name` raises ValueError."""
    broken_subs = SubtitleFile(bad_subtitles_file.name, language='en')
    with pytest.raises(ValueError):
        broken_subs.process_file()
def _truncated_description(node):
    """Return the node's description cut to 400 characters, or '' if it is missing/None."""
    description = node.get("description")
    return '' if description is None else description[:400]


def _exercise_preview_url(base_path, node_path):
    """Build an exercise preview URL from the base path and the node's path.

    A literal leading 'khan' is dropped from `node_path`. In the combined
    path the second-to-last '/'-separated segment (the slug) is replaced by
    'e', and the slug is appended at the end.
    """
    # Remove only the literal prefix. str.strip('khan') would treat 'khan' as
    # a set of characters and eat any of k/h/a/n from BOTH ends of the path.
    if node_path.startswith('khan'):
        node_path = node_path[len('khan'):]
    segments = (base_path + node_path).split('/')
    slug = segments[-2]
    # Swap just that one segment. A plain str.replace(slug, 'e') would also
    # rewrite every other place in the URL where the slug text happens to occur.
    segments[-2] = 'e'
    return '/'.join(segments) + slug


def create_node(node, assessment_dict, subtitle_path, vtt_videos, base_path,
                lite_version, lang_code):
    """Create the content node matching `node['kind']`.

    Args:
        node: dict-like description of the item. Keys read are 'kind', 'id',
            'title', 'description' and, depending on kind, 'path',
            'all_assessment_items', 'suggested_completion_criteria',
            'image_url_256', 'youtube_id' and 'image_url'.
        assessment_dict: mapping from assessment item id to a dict holding an
            "item_data" string. It is modified in place: matches of
            FILE_URL_REGEX are rewritten to local image paths.
        subtitle_path: directory prefix under which '<youtube_id>.vtt' files
            are looked up.
        vtt_videos: container of youtube ids that have a .vtt file.
        base_path: prefix used to build exercise preview URLs.
        lite_version: when truthy, questions get no `source_url`.
        lang_code: language code passed to `getlang` for subtitle files.

    Returns:
        An ExerciseNode, TopicNode or VideoNode, or None when the kind is
        not one of 'Exercise', 'Topic' or 'Video'.
    """
    kind = node.get('kind')
    # Exercise node creation
    if kind == 'Exercise':
        child_node = ExerciseNode(
            source_id=node['id'],
            title=node['title'],
            exercise_data={
                'mastery_model': node.get('suggested_completion_criteria')
            },
            description=_truncated_description(node),
            license=licenses.ALL_RIGHTS_RESERVED,
            thumbnail=node.get('image_url_256'),
        )

        # build exercise urls for previews
        full_path = _exercise_preview_url(base_path, node.get('path'))

        # attach Perseus questions to Exercises
        for item in node['all_assessment_items']:
            entry = assessment_dict[item['id']]
            # we replace all references to assessment images with the local file path to the image
            for match in re.finditer(FILE_URL_REGEX, entry["item_data"]):
                file_path = str(match.group(0)).replace('\\', '')
                file_path = file_path.replace(REPLACE_STRING,
                                              IMAGE_DL_LOCATION)
                # count=1: each pass rewrites the first remaining match only
                entry["item_data"] = re.sub(
                    FILE_URL_REGEX, file_path, entry["item_data"], 1)
            question = PerseusQuestion(
                id=item['id'],
                raw_data=entry['item_data'],
                source_url=full_path if not lite_version else None,
            )
            child_node.add_question(question)

    # Topic node creation
    elif kind == 'Topic':
        child_node = TopicNode(
            source_id=node["id"],
            title=node["title"],
            description=_truncated_description(node))

    # Video node creation
    elif kind == 'Video':
        # standard download url for KA videos
        download_url = "https://cdn.kastatic.org/KA-youtube-converted/{0}.mp4/{1}.mp4".format(
            node['youtube_id'], node['youtube_id'])
        files = [VideoFile(download_url)]
        if node['youtube_id'] in vtt_videos:
            files.append(
                SubtitleFile(subtitle_path +
                             '/{}.vtt'.format(node['youtube_id']),
                             language=getlang(lang_code)))
        child_node = VideoNode(
            source_id=node["id"],
            title=node["title"],
            description=_truncated_description(node),
            files=files,
            thumbnail=node.get('image_url'),
            license=licenses.CC_BY_NC_SA)

    else:  # unknown content file format
        return None

    return child_node
Exemple #13
0
def make_subtitles_file(subtitles_file_file, language='en'):
    """
    Creates a SubtitleFile object with path taken from `subtitles_file_file.name`
    and the given `language` (default 'en').
    """
    subtitles_path = subtitles_file_file.name
    return SubtitleFile(subtitles_path, language=language)