Esempio n. 1
0
def test_youtube_edgecases_alpha2_codes(simplified_chinese_codes,
        traditional_chinese_codes, taiwan_chinese_codes):
    """Legacy and regional YouTube alpha-2 codes must resolve to the
    internal le-utils language representation."""
    # The deprecated YouTube code `iw` must still resolve to Hebrew.
    hebrew = languages.getlang_by_alpha2('iw')
    assert hebrew is not None, 'Hebrew not found'
    assert hebrew.code == "he", 'Wrong code'
    assert hebrew.name == "Hebrew (modern)", 'Wrong name'
    assert hebrew.native_name == "עברית", 'Wrong native_name'

    # Each group of regional Chinese language codes must collapse onto a
    # single internal code with consistent metadata.
    chinese_cases = [
        (simplified_chinese_codes, "zh-CN", "Chinese, Simplified",
         "中国大陆", 'Simplified Chinese not found'),
        (traditional_chinese_codes, "zh-Hant", "Chinese, Traditional",
         "漢語 (繁體字)", 'Traditional Chinese not found'),
        (taiwan_chinese_codes, "zh-TW", "Chinese, Taiwan",
         "漢語 (臺灣)", 'Taiwan Chinese not found'),
    ]
    for codes, expected_code, expected_name, expected_native, missing_msg in chinese_cases:
        for alpha2 in codes:
            resolved = languages.getlang_by_alpha2(alpha2)
            assert resolved is not None, missing_msg
            assert resolved.code == expected_code, 'Wrong internal repr. code'
            assert resolved.name == expected_name, 'Wrong name'
            assert resolved.native_name == expected_native, 'Wrong native_name'
def fetch_video(video):
    """Convert one youtube-dl info dict into a ricecooker VideoNode.

    Attaches the YouTube video file plus a subtitle file for every
    reported subtitle language that resolves to a known language code.
    """
    youtube_id = video['id']
    title = video['title']
    description = video['description']
    youtube_url = video['webpage_url']
    subtitle_languages = video['subtitles'].keys()

    print("    Fetching video data: %s (%s)" % (title, youtube_url))

    video_node = nodes.VideoNode(
        source_id=youtube_id,
        title=truncate_metadata(title),
        license=LICENSE,
        description=truncate_description(description),
        derive_thumbnail=True,
        language="en",
        files=[files.YouTubeVideoFile(youtube_id=youtube_id)],
    )

    # Attach a subtitle file for each language whose code we can resolve,
    # either directly or through the alpha-2 fallback.
    for subtitle_lang in subtitle_languages:
        # TODO(david): Should catch exception thrown by
        # files.YouTubeSubtitleFile rather than breaking abstraction.
        resolvable = (languages.getlang(subtitle_lang)
                      or languages.getlang_by_alpha2(subtitle_lang))
        if resolvable:
            subtitle_file = files.YouTubeSubtitleFile(
                youtube_id=youtube_id, language=subtitle_lang)
            video_node.add_file(subtitle_file)
        else:
            print("WARNING: Subtitle language %s not found in languages file" %
                  subtitle_lang)

    return video_node
def test_known_alpha2_codes():
    """Well-known two-letter codes must resolve to the expected languages."""
    expectations = [
        ('en', "en", "English", "English",
         'English not found', 'Wrong code'),
        ('zu', "zul", "Zulu", "isiZulu",
         'Zulu not found', 'Wrong internal repr. code'),
        ('pt', "pt", "Portuguese", "Português",
         'Portuguese not found', 'Wrong code'),
    ]
    for alpha2, code, name, native, missing_msg, code_msg in expectations:
        lang_obj = languages.getlang_by_alpha2(alpha2)
        assert lang_obj is not None, missing_msg
        assert lang_obj.code == code, code_msg
        assert lang_obj.name == name, 'Wrong name'
        assert lang_obj.native_name == native, 'Wrong native_name'
Esempio n. 4
0
def test_known_alpha2_codes():
    """Well-known two-letter codes must resolve to the expected languages."""
    def _verify(alpha2, code, name, native, missing_msg, code_msg):
        # One lookup + the four standard assertions per language.
        obj = languages.getlang_by_alpha2(alpha2)
        assert obj is not None, missing_msg
        assert obj.code == code, code_msg
        assert obj.name == name, 'Wrong name'
        assert obj.native_name == native, 'Wrong native_name'

    _verify('en', "en", "English", "English",
            'English not found', 'Wrong code')
    _verify('zu', "zul", "Zulu", "isiZulu",
            'Zulu not found', 'Wrong internal repr. code')
    _verify('pt', "pt", "Portuguese", "Português",
            'Portuguese not found', 'Wrong code')
Esempio n. 5
0
def test_known_alpha2_codes():
    """Well-known two-letter codes must resolve to the expected languages."""
    cases = {
        "en": ("en", "English", "English",
               "English not found", "Wrong code"),
        "zu": ("zul", "Zulu", "isiZulu",
               "Zulu not found", "Wrong internal repr. code"),
        "pt": ("pt", "Portuguese", "Português",
               "Portuguese not found", "Wrong code"),
    }
    for alpha2, (code, name, native, missing_msg, code_msg) in cases.items():
        lang_obj = languages.getlang_by_alpha2(alpha2)
        assert lang_obj is not None, missing_msg
        assert lang_obj.code == code, code_msg
        assert lang_obj.name == name, "Wrong name"
        assert lang_obj.native_name == native, "Wrong native_name"
Esempio n. 6
0
 def __init__(self, youtube_id, language=None, **kwargs):
     """
     Build a subtitle file reference for the YouTube video `youtube_id`.

     `language` may be a languages.Language object or a language code
     string (possibly a YouTube-specific code); it is normalized to the
     internal le-utils representation before calling the parent __init__.
     """
     self.youtube_url = 'http://www.youtube.com/watch?v={}'.format(youtube_id)
     if isinstance(language, languages.Language):  # for backward compatibility
         language = language.code
     self.youtube_language = language  # youtube language code (can differ from internal repr.)
     language_obj = languages.getlang(language)   # lookup `language` using internal representation
     # if language_obj not None, we know `language` is a valid language_id in the internal repr.
     if language_obj is None:  # if `language` not found using internal repr.
         language_obj = languages.getlang_by_alpha2(language)  # try to match by two-letter ISO code
         # NOTE(review): if the alpha-2 lookup also fails, language_obj is None
         # and the next line raises AttributeError — confirm callers pre-validate codes.
         language = language_obj.code   # update `language` argument from internal repr. language_id
     super(YouTubeSubtitleFile, self).__init__(language=language, **kwargs)
     assert self.language, "Subtitles must have a language"
Esempio n. 7
0
    def download_and_transform_file(self, path):
        """
        Download subtitles file at `path` and transform it to `.vtt` if necessary.
        Args: path (URL or local path)
        Returns: filename of the final .vtt file (also stored in FILECACHE)
        Raises: InvalidSubtitleLanguageError if `self.language` cannot be
            matched to any language present in the subtitle file.
        """
        # Cache key is based on the source path; reuse the cached result
        # unless a forced update was requested or the source is outdated.
        key = "DOWNLOAD:{}".format(path)
        cache_file = get_cache_filename(key)
        if not config.UPDATE and not cache_is_outdated(path, cache_file):
            return cache_file

        config.LOGGER.info("\tDownloading {}".format(path))

        # Two temp files: one for the raw download, one for the converted VTT.
        with tempfile.NamedTemporaryFile() as temp_in_file,\
                tempfile.NamedTemporaryFile() as temp_out_file:
            write_and_get_hash(path, temp_in_file)
            temp_in_file.seek(0)

            converter = build_subtitle_converter_from_file(
                temp_in_file.name, self.subtitlesformat)

            # We'll assume the provided file is in the passed language in this case
            if len(converter.get_language_codes()) == 1 \
                    and converter.has_language(LANGUAGE_CODE_UNKNOWN):
                converter.replace_unknown_language(self.language)

            convert_lang_code = self.language

            # Language is not present, let's try different codes
            if not converter.has_language(self.language):
                # Look for a code in the file that maps (via the alpha-2
                # fallback) to the same internal language as self.language.
                for lang_code in converter.get_language_codes():
                    language = languages.getlang_by_alpha2(lang_code)

                    if language and language.code == self.language:
                        convert_lang_code = lang_code
                        break
                else:
                    # for-else: no candidate code matched -> hard error.
                    raise InvalidSubtitleLanguageError(
                        "Missing language '{}' in subtitle file".format(
                            self.language))

            converter.write(temp_out_file.name, convert_lang_code)

            temp_out_file.seek(0)
            file_hash = get_hash(temp_out_file.name)

            # Content-addressed filename derived from the converted file's hash.
            filename = '{0}.{ext}'.format(file_hash, ext=file_formats.VTT)

            temp_out_file.seek(0)
            copy_file_to_storage(filename, temp_out_file)
            FILECACHE.set(key, bytes(filename, "utf-8"))

        return filename
Esempio n. 8
0
def _get_language_with_alpha2_fallback(language_code):
    """
    Resolve `language_code` (string) to a le-utils Language object.

    First tries the internal language codes; if that lookup fails, falls
    back to mapping `language_code` to the internal representation using
    the `getlang_by_alpha2` helper method.
    Returns a le-utils Language object, or None if both lookups fail.
    """
    # 1. lookup using the internal representation
    resolved = languages.getlang(language_code)
    if resolved is not None:
        return resolved
    # 2. fall back to matching by two-letter ISO code
    return languages.getlang_by_alpha2(language_code)
def test_youtube_edgecases_alpha2_codes(simplified_chinese_codes, traditional_chinese_codes):
    """Legacy and regional YouTube alpha-2 codes must resolve correctly."""
    # The deprecated YouTube code `iw` must still resolve to Hebrew.
    hebrew = languages.getlang_by_alpha2('iw')
    assert hebrew is not None, 'Hebrew not found'
    assert hebrew.code == "he", 'Wrong code'
    assert hebrew.name == "Hebrew (modern)", 'Wrong name'
    assert hebrew.native_name == "עברית", 'Wrong native_name'

    # Every Simplified Chinese code must collapse to zh-CN.
    for alpha2 in simplified_chinese_codes:
        resolved = languages.getlang_by_alpha2(alpha2)
        assert resolved is not None, 'Simplified Chinese not found'
        assert resolved.code == "zh-CN", 'Wrong internal repr. code'
        assert resolved.name == "Chinese, Simplified", 'Wrong name'
        assert resolved.native_name == "中国大陆", 'Wrong native_name'

    # Every Traditional Chinese code must collapse to zh-TW.
    for alpha2 in traditional_chinese_codes:
        resolved = languages.getlang_by_alpha2(alpha2)
        assert resolved is not None, 'Traditional Chinese not found'
        assert resolved.code == "zh-TW", 'Wrong internal repr. code'
        assert resolved.name == "Chinese, Traditional", 'Wrong name'
        assert resolved.native_name == "正體字/繁體字", 'Wrong native_name'
    def __init__(self, lang, *args, **kwargs):
        """
        Initialize the chef for language code `lang` (a code resolvable by
        languages.getlang_by_alpha2).
        """
        self.lang_id = lang
        self.lang_data = languages.getlang_by_alpha2(self.lang_id)
        # NOTE(review): if the lookup fails, lang_data is None and the
        # native_name access below raises AttributeError — confirm `lang`
        # is always a supported code.
        # Code in __init__ may call get_channel, which expects channel_info to be defined,
        # so we do the initialization here.
        self.channel_info = {
            # FIXME: Translate language titles
            'CHANNEL_TITLE': 'ProFuturo ({})'.format(self.lang_data.native_name),
            'CHANNEL_SOURCE_DOMAIN': 'profuturo.education',  # where you got the content
            'CHANNEL_SOURCE_ID': 'profuturo-'+self.lang_id,  # channel's unique id
            'CHANNEL_LANGUAGE': self.lang_id,  # le_utils language code
            'CHANNEL_DESCRIPTION': '',  # (optional)
        }

        super(ProFuturoChef, self).__init__(*args, **kwargs)
Esempio n. 11
0
def test_unknown_alpha2_code():
    """An unassigned alpha-2 code must not resolve to any language."""
    # 'zz' is not an assigned code, so the lookup must fail cleanly.
    lang_obj = languages.getlang_by_alpha2('zz')
    # Fixed typo in the assertion message ('Uknown' -> 'Unknown').
    assert lang_obj is None, 'Unknown code zz returned non-None'
Esempio n. 12
0
def test_unknown_alpha2_code():
    """An unassigned alpha-2 code must not resolve to any language."""
    lang_obj = languages.getlang_by_alpha2('zz')
    # Assertion message typo fixed: 'Uknown' -> 'Unknown'.
    assert lang_obj is None, 'Unknown code zz returned non-None'
Esempio n. 13
0
def subtitle_upload(request):
    """
    Handle an uploaded subtitle file: convert it to VTT and persist it as
    a File record owned by the requesting user.
    Returns an HTTP 200 with the serialized File on success, or an HTTP
    400 when the format is invalid or the requested language is absent.
    NOTE(review): this is Python 2 code (`unicode`, indexable
    `request.FILES.values()`) — confirm the target runtime before reuse.
    """
    # File will be converted to VTT format
    ext = file_formats.VTT
    language_id = request.META.get('HTTP_LANGUAGE')
    content_file = request.FILES.values()[0]

    with NamedTemporaryFile() as temp_file:
        try:
            converter = build_subtitle_converter(
                unicode(content_file.read(), 'utf-8'))
            convert_language_code = language_id

            # We're making the assumption here that language the user selected is truly the caption
            # file's language if it's unknown
            if len(converter.get_language_codes()) == 1 \
                    and converter.has_language(LANGUAGE_CODE_UNKNOWN):
                converter.replace_unknown_language(language_id)

            # determine if the request language exists by another code, otherwise we can't continue
            if not converter.has_language(convert_language_code):
                for language_code in converter.get_language_codes():
                    language = getlang_by_alpha2(language_code)
                    if language and language.code == language_id:
                        convert_language_code = language_code
                        break
                else:
                    # for-else: no code in the file maps to the requested language.
                    return HttpResponseBadRequest(
                        "Language '{}' not present in subtitle file".format(
                            language_id))

            converter.write(temp_file.name, convert_language_code)
        except InvalidSubtitleFormatError as ex:
            return HttpResponseBadRequest(
                "Subtitle conversion failed: {}".format(ex))

        temp_file.seek(0)
        converted_file = DjFile(temp_file)

        # Quota check happens before the File row is created.
        checksum = get_hash(converted_file)
        size = converted_file.size
        request.user.check_space(size, checksum)

        file_object = File(
            file_size=size,
            file_on_disk=converted_file,
            checksum=checksum,
            file_format_id=ext,
            original_filename=request.FILES.values()[0]._name,
            preset_id=request.META.get('HTTP_PRESET'),
            language_id=language_id,
            uploaded_by=request.user,
        )
        file_object.save()

    return HttpResponse(
        json.dumps({
            "success":
            True,
            "filename":
            str(file_object),
            "file":
            JSONRenderer().render(FileSerializer(file_object).data)
        }))
def build_lang_lookup_table(FEED_ROOT_URL):
    """
    Extracts all the root URLs of the languages, based on the links
    with face `Languages` in FEED_ROOT_URL.
    Returns a dict mapping internal lang_code --> dict with info about
    content in that language (alpha_3 code, feed title, href, name,
    native_name).
    Raises AssertionError when any feed language cannot be mapped to a
    supported Kolibri language, and ValueError when a language href has
    no recognizable language code.
    """
    OPDS_LANG_ROOTS = {}

    # Collect languages we don't yet support in Kolibri so they can all be
    # reported together at the end.
    langs_not_found = []

    feed = feedparser.parse(FEED_ROOT_URL)
    lang_links = []
    for link in feed.feed.links:
        if 'opds:facetgroup' in link:
            fg = link['opds:facetgroup']
            if fg == 'Languages':
                lang_links.append(link)

    # Build lookup table    lang_code --> dict with info about content in that language
    # where lang_code is the Learning Equality internal language codes defined in le_utils
    # Assume the chef script will be run on the command line using   lang=lang_code
    # E.g. lang_code for Zulu is `zul`, for Amharic it's `am`, and for Nepali it's `ne-NP`
    for link in lang_links:
        href = link['href']
        m = _LANG_CODE_RE.search(href)
        if not m:
            raise ValueError('Cannot find language code in href: ' + str(href))
        gdl_lang_code = m.groupdict()['gdl_lang_code']
        lang_title = link['title']
        # Normalize feed titles that differ from the names used in le_utils.
        if lang_title == "isiNdebele seSewula":
            lang_title = "isiNdebele"
        elif lang_title == 'বাঙালি':
            lang_title = 'বাংলা'

        # (typo fixed: 'Processig' -> 'Processing')
        print('Processing lang_title', lang_title)
        #
        # ATTEMPT 1: look up by English name ##############
        lang_obj = getlang_by_name(lang_title)
        if not lang_obj:
            #
            # ATTEMPT 2: look up by native name #########
            lang_obj = getlang_by_native_name(lang_title)
            if not lang_obj:
                #
                # ATTEMPT 3: map the feed's ISO 639-3 code via pycountry,
                # preferring the two-letter code when one exists ##############
                pyc_lang = pycountry.languages.lookup(gdl_lang_code)
                code = pyc_lang.alpha_3
                if hasattr(pyc_lang, 'alpha_2'):
                    code = pyc_lang.alpha_2

                # getlang_by_alpha2 is a misnomer, codes can be alpha2, alpha3, or lang+locale.
                lang_obj = getlang_by_alpha2(code)
                if not lang_obj:
                    langs_not_found.append((pyc_lang, lang_title))
                    print('ERROR could not find Kolibri lang info for ',
                          pyc_lang)
                    continue
        lang_code = lang_obj.code
        OPDS_LANG_ROOTS[lang_code] = dict(
            alpha_3=gdl_lang_code,
            lang_title=lang_title,
            href=href,
            name=lang_obj.name,
            native_name=lang_obj.native_name,
        )

    # For now, make missing languages a hard error so we can evaluate new language support case-by-case.
    if langs_not_found:
        lang_codes = [pyc_lang.alpha_3 for pyc_lang, _title in langs_not_found]
        message = "The following languages are not yet supported in Kolibri: {}".format(
            ",".join(lang_codes))
        # Raise explicitly (same AssertionError as before) instead of using
        # `assert`, which is silently stripped under `python -O`.
        raise AssertionError(message)

    return OPDS_LANG_ROOTS