def test_set_language():
    """
    Check how SubtitleFile stores its language.

    The language may be given as a plain code or as the object returned by
    `languages.getlang`; in both cases the `language` attribute is expected to
    be a `str` equal to the code. An unknown code must raise `TypeError`.
    """
    from_code = SubtitleFile('path', language='en')
    from_model = SubtitleFile('path', language=languages.getlang('es'))

    assert isinstance(from_code.language, str), \
        "Subtitles must be converted to Language class"
    assert isinstance(from_model.language, str), \
        "Subtitles can be passed as Langauge models"

    assert from_code.language == 'en', "Subtitles must have a language"
    assert from_model.language == 'es', "Subtitles must have a language"

    # Constructing with an unrecognised language code has to fail.
    with pytest.raises(TypeError):
        SubtitleFile('path', language='notalanguage')
def test_convertible_substitles_ar_ttml(youtube_test_file):
    """
    Regression test: the downloaded Arabic .ttml subtitles can be processed
    with language 'ar' and the result is a .vtt file.

    NOTE(review): `youtube_test_file` is not referenced in the body; presumably
    the fixture is what puts the .ttml file in place -- confirm.
    """
    ttml_path = os.path.join(
        "tests", "testcontent", "downloaded", "testsubtitles_ar.ttml")
    assert os.path.exists(ttml_path)

    converted_name = SubtitleFile(ttml_path, language='ar').process_file()

    assert converted_name, 'conferted filename must exit'
    assert converted_name.endswith('.vtt'), \
        'conferted filename must have .vtt extension'
def test_multiple_subs_can_be_added(video_file):
    """
    Baseline check: subtitle files with different language codes all survive
    `VideoNode.validate()`.
    """
    assert os.path.exists("tests/testcontent/testsubtitles_ar.srt")

    video_node = VideoNode('vid-src-id', "Video", licenses.PUBLIC_DOMAIN)
    video_node.add_file(video_file)

    # Same source file attached twice, each time under a different language.
    for lang_code in ('en', 'ar'):
        subs = SubtitleFile("tests/testcontent/testsubtitles_ar.srt", language=lang_code)
        video_node.add_file(subs)

    video_node.validate()

    subtitle_count = sum(
        1 for attached in video_node.files if isinstance(attached, SubtitleFile))
    assert subtitle_count == 2, 'Missing subtitles files!'
def test_duplicate_language_codes_fixed_by_validate(video_file):
    """
    Video nodes should keep at most one subtitle file per language code:
    after `validate()` only one of two 'ar' subtitle files may remain.
    """
    assert os.path.exists("tests/testcontent/testsubtitles_ar.srt")

    video_node = VideoNode('vid-src-id', "Video", licenses.PUBLIC_DOMAIN)
    video_node.add_file(video_file)

    # The second pass adds a file with a duplicate language code.
    for lang_code in ('ar', 'ar'):
        subs = SubtitleFile("tests/testcontent/testsubtitles_ar.srt", language=lang_code)
        video_node.add_file(subs)

    video_node.validate()

    subtitle_count = sum(
        1 for attached in video_node.files if isinstance(attached, SubtitleFile))
    assert subtitle_count == 1, 'Duplicate subtitles files not removed!'
def construct_channel(self, *args, **kwargs):
    """
    Build the channel tree: one topic holding a downloaded Wikipedia page and
    one video node with English and Spanish subtitle files attached.

    The channel object comes from `self.get_channel(**kwargs)`; positional
    arguments are accepted but not used. Returns the populated channel.
    """
    category_path = '/wiki/Category:Articles_containing_video_clips'

    channel = self.get_channel(**kwargs)
    topic = TopicNode(source_id=category_path, title="Articles containing video clips")
    channel.add_child(topic)

    # Wikipedia page for the category, shown with the film's thumbnail.
    page_node = download_wikipedia_page(
        category_path,
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ee/A_Is_for_Atom_1953.webm/220px--A_Is_for_Atom_1953.webm.jpg',
        'A Is for Atom',
    )
    topic.add_child(page_node)

    # The video itself.
    video_node = VideoNode(
        title='A Is for Atom 1953',
        source_id='A_Is_for_Atom_1953.webm',
        files=[
            VideoFile(path='https://upload.wikimedia.org/wikipedia/commons/e/ee/A_Is_for_Atom_1953.webm'),
        ],
        license=licenses.PublicDomainLicense(),
    )

    # The subtitle URLs carry no file extension, so the format is passed explicitly.
    subtitle_url = 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3AA_Is_for_Atom_1953.webm&lang={}&trackformat=srt'
    for lang in ('en', 'es'):
        video_node.add_file(
            SubtitleFile(path=subtitle_url.format(lang), language=lang, subtitlesformat='srt'))

    topic.add_child(video_node)
    return channel
def test_convertible_substitles_ar_srt():
    """
    Basic check that srt --> vtt conversion works.

    Processes the sample Arabic .srt file, checks that the returned filename
    has a .vtt extension, then reads the stored file back and looks for a
    known Arabic phrase in it.
    """
    local_path = os.path.join("tests", "testcontent", "samples", "testsubtitles_ar.srt")
    assert os.path.exists(local_path)

    subtitle_file = SubtitleFile(local_path, language='ar')
    filename = subtitle_file.process_file()
    assert filename, 'converted filename must exist'
    assert filename.endswith('.vtt'), 'converted filename must have .vtt extension'

    storage_path = config.get_storage_path(filename)
    # WebVTT is a UTF-8 format. Without an explicit encoding, open() falls back
    # to the locale's preferred encoding, which breaks (or mis-decodes) the
    # Arabic text on systems whose default is not UTF-8.
    with open(storage_path, encoding='utf-8') as converted_vtt:
        filecontents = converted_vtt.read()

    check_words = 'لناس على'
    assert check_words in filecontents, 'missing check word in converted subs'
def subtitle_file():
    """
    Return an English SubtitleFile backed by a small generated .vtt file.

    The file is written only when it does not exist yet; it contains a WEBVTT
    header and a single cue.
    """
    local_path = os.path.join("tests", "testcontent", "generated", "testsubtitles.vtt")

    if not os.path.exists(local_path):
        vtt_lines = (
            b'WEBVTT\n',
            b'\n',
            b'00:01.000 --> 00:04.250\n',
            b'Testing subtitles\n',
        )
        with open(local_path, 'wb') as vtt_out:
            vtt_out.writelines(vtt_lines)

    return SubtitleFile(local_path, language='en')
def test_convertible_substitles_noext_subtitlesformat():
    """
    Check that subtitles are converted when the path has no extension and the
    format is supplied through `subtitlesformat` instead.
    """
    ttml_path = os.path.join(
        "tests", "testcontent", "downloaded", "testsubtitles_ar.ttml")
    assert os.path.exists(ttml_path)

    # Make an extension-less copy of the .ttml file to feed to SubtitleFile.
    extensionless_path = ttml_path.replace('.ttml', '')
    copyfile(ttml_path, extensionless_path)
    assert os.path.exists(extensionless_path)

    # The format cannot be inferred from the path, so it is given explicitly.
    subs = SubtitleFile(extensionless_path, language='ar', subtitlesformat='ttml')
    converted_name = subs.process_file()

    assert converted_name, 'conferted filename must exit'
    assert converted_name.endswith('.vtt'), \
        'conferted filename must have .vtt extension'
def test_convertible_substitles_from_pressurcooker(pressurcooker_test_files):
    """
    Try to load all the test files used in pressurecooker as ricecooker
    `SubtitleFile`s.

    Each fixture entry is a dict providing 'localpath', 'language' and
    'check_words'. All subs have the appropriate extension, so there is no
    need to specify `subtitlesformat`. For every entry the converted file must
    be a .vtt that contains the entry's check words.
    """
    for fixture in pressurcooker_test_files:
        localpath = fixture['localpath']
        assert os.path.exists(localpath), 'Error mising local test file ' + localpath

        subtitle_file = SubtitleFile(localpath, language=fixture['language'])
        filename = subtitle_file.process_file()
        assert filename, 'conferted filename must exit'
        assert filename.endswith('.vtt'), 'conferted filename must have .vtt extension'

        storage_path = config.get_storage_path(filename)
        # WebVTT is a UTF-8 format. Read it as such instead of relying on the
        # locale's default encoding, which may not be able to decode the
        # non-ASCII check words.
        with open(storage_path, encoding='utf-8') as converted_vtt:
            filecontents = converted_vtt.read()

        assert fixture['check_words'] in filecontents, \
            'missing check_words in converted subs'
def test_convertible_substitles_weirdext_subtitlesformat():
    """
    Check that we handle cases when ext cannot be guessed from URL or localpath.

    Passing `subtitlesformat` allows chef authors to manually specify the subs
    format. The test processes a Wikimedia timedtext URL as 'srt', then checks
    that the result is a .vtt file containing a known Spanish sentence.
    """
    subs_url = 'https://commons.wikimedia.org/w/api.php?' \
        + 'action=timedtext&title=File%3AA_Is_for_Atom_1953.webm&lang=es&trackformat=srt'

    subtitle_file = SubtitleFile(
        subs_url,
        language='es',
        subtitlesformat='srt',  # set explicitly: the ext can't be inferred from the URL
    )
    filename = subtitle_file.process_file()
    assert filename, 'conferted filename must exit'
    assert filename.endswith('.vtt'), 'conferted filename must have .vtt extension'

    storage_path = config.get_storage_path(filename)
    # WebVTT is a UTF-8 format. Decode explicitly so the accented characters in
    # the check sentence do not depend on the locale's default encoding.
    with open(storage_path, encoding='utf-8') as converted_vtt:
        filecontents = converted_vtt.read()

    assert 'El total de los protones y neutrones de un átomo' in filecontents, \
        'missing check words in converted subs'
def test_bad_subtitles_raises(bad_subtitles_file):
    """Processing the file supplied by `bad_subtitles_file` must raise ValueError."""
    broken_subs = SubtitleFile(bad_subtitles_file.name, language='en')
    with pytest.raises(ValueError):
        broken_subs.process_file()
def _short_description(node, limit=400):
    """Return the node's description cut to `limit` characters ('' if missing or None)."""
    description = node.get("description")
    return '' if description is None else description[:limit]


def _exercise_preview_url(base_path, node_path):
    """Turn `<base><...>/<slug>/` into the preview URL `<base><...>/e/<slug>`."""
    # Drop only a leading 'khan' prefix. str.strip('khan') treats its argument
    # as a set of characters and removes k/h/a/n from BOTH ends of the path.
    if node_path.startswith('khan'):
        node_path = node_path[len('khan'):]

    # Swap just the slug segment for 'e'. A plain str.replace(slug, 'e') would
    # also rewrite every other occurrence of the slug text in the URL.
    segments = (base_path + node_path).split('/')
    slug = segments[-2]
    segments[-2] = 'e'
    return '/'.join(segments) + slug


def create_node(node, assessment_dict, subtitle_path, vtt_videos, base_path, lite_version, lang_code):
    """
    Build the content node matching `node['kind']`.

    Args:
        node: dict describing the item; 'kind' selects the node type, and the
            remaining keys read depend on that kind.
        assessment_dict: mapping of assessment item id to a dict with an
            'item_data' string. It is modified in place: image references in
            'item_data' are rewritten to local file paths.
        subtitle_path: directory containing `<youtube_id>.vtt` files.
        vtt_videos: collection of youtube ids that have a .vtt file.
        base_path: prefix prepended to the node path for exercise preview URLs.
        lite_version: when truthy, questions get no `source_url`.
        lang_code: language code passed to `getlang` for subtitle files.

    Returns:
        An ExerciseNode, TopicNode or VideoNode, or None for any other kind.
    """
    kind = node.get('kind')

    # Exercise node creation
    if kind == 'Exercise':
        child_node = ExerciseNode(
            source_id=node['id'],
            title=node['title'],
            exercise_data={'mastery_model': node.get('suggested_completion_criteria')},
            description=_short_description(node),
            license=licenses.ALL_RIGHTS_RESERVED,
            thumbnail=node.get('image_url_256'),
        )

        # build exercise urls for previews
        full_path = _exercise_preview_url(base_path, node.get('path'))

        # attach Perseus questions to Exercises
        for item in node['all_assessment_items']:
            assessment = assessment_dict[item['id']]

            # Replace all references to assessment images with the local file
            # path to the image. Matches are taken from the text as it was when
            # the loop started; each pass rewrites the first remaining match.
            # NOTE(review): file_path is used as a regex replacement template;
            # backslashes are removed from the match, but IMAGE_DL_LOCATION is
            # inserted afterwards -- confirm it contains none.
            for match in re.finditer(FILE_URL_REGEX, assessment["item_data"]):
                file_path = str(match.group(0)).replace('\\', '')
                file_path = file_path.replace(REPLACE_STRING, IMAGE_DL_LOCATION)
                assessment["item_data"] = re.sub(
                    FILE_URL_REGEX, file_path, assessment["item_data"], count=1)

            question = PerseusQuestion(
                id=item['id'],
                raw_data=assessment['item_data'],
                source_url=full_path if not lite_version else None,
            )
            child_node.add_question(question)

    # Topic node creation
    elif kind == 'Topic':
        child_node = TopicNode(
            source_id=node["id"],
            title=node["title"],
            description=_short_description(node),
        )

    # Video node creation
    elif kind == 'Video':
        youtube_id = node['youtube_id']
        # standard download url for KA videos
        download_url = "https://cdn.kastatic.org/KA-youtube-converted/{0}.mp4/{1}.mp4".format(
            youtube_id, youtube_id)
        files = [VideoFile(download_url)]
        if youtube_id in vtt_videos:
            files.append(
                SubtitleFile(subtitle_path + '/{}.vtt'.format(youtube_id),
                             language=getlang(lang_code)))
        child_node = VideoNode(
            source_id=node["id"],
            title=node["title"],
            description=_short_description(node),
            files=files,
            thumbnail=node.get('image_url'),
            license=licenses.CC_BY_NC_SA,
        )

    else:
        # unknown content file format
        return None

    return child_node
def make_subtitles_file(subtitles_file_file, language='en'):
    """
    Create a SubtitleFile whose path is taken from `subtitles_file_file.name`.

    `language` is passed through to SubtitleFile and defaults to 'en'.
    """
    subtitles_path = subtitles_file_file.name
    return SubtitleFile(subtitles_path, language=language)