Esempio n. 1
0
def test_known_names():
    """Verify lookups by full English language name return the right objects."""
    english = languages.getlang_by_name('English')
    assert english is not None, 'English not found'
    assert english.code == "en", 'Wrong code'
    assert english.name == "English", 'Wrong name'
    assert english.native_name == "English", 'Wrong native_name'

    zulu = languages.getlang_by_name('Zulu')
    assert zulu is not None, 'Zulu not found'
    assert zulu.code == "zul", 'Wrong internal repr. code'
    assert zulu.name == "Zulu", 'Wrong name'
    assert zulu.native_name == "isiZulu", 'Wrong native_name'

    # Only full-name matching is supported, so the localized variant must be
    # looked up via its "name, country" form.
    pt_br = languages.getlang_by_name('Portuguese, Brazil')
    assert pt_br is not None, 'Brazilian Portuguese not found'
    assert pt_br.code == "pt-BR", 'Wrong internal repr. code'
    assert pt_br.name == "Portuguese, Brazil", 'Wrong name'
    assert pt_br.native_name == "Português (Brasil)", 'Wrong native_name'

    # Multi-name entries (separated by semicolons) currently require an
    # exact full-string match, e.g. "Scottish Gaelic; Gaelic".
    gaelic = languages.getlang_by_name('Scottish Gaelic; Gaelic')
    assert gaelic is not None, 'Scottish Gaelic; Gaelic not found'
    assert gaelic.code == "gd", 'Wrong internal repr. code'
    assert gaelic.name == "Scottish Gaelic; Gaelic", 'Wrong name'
    assert gaelic.native_name == "Gàidhlig", 'Wrong native_name'
Esempio n. 2
0
def get_khan_topic_tree(lang="en", curr_key=None):
    """
    Fetch the Khan Academy topic tree for language ``lang`` from the V2 API
    and build the node hierarchy.

    Returns the root node produced by ``_recurse_create`` starting from the
    well-known root topic id ``x00000000``.
    """
    # The KA API uses the ISO 639-2 code "swa" for Swahili while callers may
    # pass the legacy "sw" code. Normalize once up front so the API request,
    # the SUPPORTED_LANGS check, and the recursion all agree on one code
    # (the original duplicated the request and left ``lang`` as "sw").
    if lang == "sw":
        lang = "swa"

    response = make_request(
        V2_API_URL.format(lang=lang, projection=PROJECTION_KEYS),
        timeout=120,
    )

    topic_tree = ujson.loads(response.content)

    # If a language *name* was passed in, convert it to its language code.
    lang_obj = getlang_by_name(lang)
    if lang_obj:
        lang = lang_obj.primary_code

    if lang not in SUPPORTED_LANGS:
        global translations
        translations = retrieve_translations(lang_code=lang)

    # Flatten node_data: merge the per-kind node lists into a single list.
    flattened_tree = [
        node for node_list in topic_tree.values() for node in node_list
    ]

    # Convert to dict with ids as keys (for fast lookups during recursion).
    tree_dict = {node["id"]: node for node in flattened_tree}

    return _recurse_create(tree_dict["x00000000"], tree_dict, lang=lang)
def test_known_names():
    """Look up several known languages by their full English names."""
    lang_obj = languages.getlang_by_name("English")
    assert lang_obj is not None, "English not found"
    assert lang_obj.code == "en", "Wrong code"
    assert lang_obj.name == "English", "Wrong name"
    assert lang_obj.native_name == "English", "Wrong native_name"

    lang_obj = languages.getlang_by_name("Zulu")
    assert lang_obj is not None, "Zulu not found"
    assert lang_obj.code == "zul", "Wrong internal repr. code"
    assert lang_obj.name == "Zulu", "Wrong name"
    assert lang_obj.native_name == "isiZulu", "Wrong native_name"

    # Full-name matching only: the localized variant must be requested
    # with its "name, country" form.
    lang_obj = languages.getlang_by_name("Portuguese, Brazil")
    assert lang_obj is not None, "Brazilian Portuguese not found"
    assert lang_obj.code == "pt-BR", "Wrong internal repr. code"
    assert lang_obj.name == "Portuguese, Brazil", "Wrong name"
    assert lang_obj.native_name == "Português (Brasil)", "Wrong native_name"

    # Semicolon-separated multi-name entries require the exact full string.
    lang_obj = languages.getlang_by_name("Scottish Gaelic; Gaelic")
    assert lang_obj is not None, "Scottish Gaelic; Gaelic not found"
    assert lang_obj.code == "gd", "Wrong internal repr. code"
    assert lang_obj.name == "Scottish Gaelic; Gaelic", "Wrong name"
    assert lang_obj.native_name == "Gàidhlig", "Wrong native_name"
Esempio n. 4
0
def get_khan_topic_tree(lang: str = "en", update: bool = False):
    """
    Build the complete topic tree based on the results obtained from the KA API.
    Note this topic tree contains a combined topic structure that includes all
    curriculum variants, curation pages, and child data may be in wrong order.
    Returns: tuple (root_node, topics_by_slug) for further processing according
    based on SLUG_BLACKLIST and TOPIC_TREE_REPLACMENTS specified in curation.py.
    """
    if lang == "sw":  # for backward compatibility in case old Swahili code used
        lang = "swa"

    # Get the fresh data from the KA API (do not try to re-use cached data)
    topic_tree = get_khan_api_json(lang, update=update)

    # if name of lang is passed in, get language code
    if getlang(lang) is None and getlang_by_name(lang):
        lang = getlang_by_name(lang).primary_code

    # For languages outside SUPPORTED_LANGS, store the fetched translations
    # in the module-level ``translations`` global for later use.
    if lang not in SUPPORTED_LANGS:
        global translations
        translations = retrieve_translations(lang)

    # Flatten node_data (combine topics, videos, and exercises in a single list)
    flattened_tree = [node for node_list in topic_tree.values() for node in node_list]

    # Convert to dict with ids as keys (for fast lookups by id)
    tree_dict = {node["id"]: node for node in flattened_tree}

    # Build a lookup table {slug --> KhanTopic} to be used for replacement logic
    topics_by_slug = {}

    # "x00000000" is the well-known id of the KA root topic node.
    root_node = tree_dict["x00000000"]
    root = _recurse_create(root_node, tree_dict, topics_by_slug, lang=lang)

    return root, topics_by_slug
Esempio n. 5
0
def test_list_like_language_names():
    """Either name of a semicolon-separated entry should resolve to it."""
    for query in ('Scottish Gaelic', 'Gaelic'):
        lang_obj = languages.getlang_by_name(query)
        assert lang_obj is not None, 'Scottish Gaelic; Gaelic not found'
        assert lang_obj.code == "gd", 'Wrong internal repr. code'
        assert lang_obj.name == "Scottish Gaelic; Gaelic", 'Wrong name'
        assert lang_obj.native_name == "Gàidhlig", 'Wrong native_name'
Esempio n. 6
0
def test_list_like_language_names():
    """Both aliases of a semicolon-separated language name resolve to it."""
    for alias in ("Scottish Gaelic", "Gaelic"):
        found = languages.getlang_by_name(alias)
        assert found is not None, "Scottish Gaelic; Gaelic not found"
        assert found.code == "gd", "Wrong internal repr. code"
        assert found.name == "Scottish Gaelic; Gaelic", "Wrong name"
        assert found.native_name == "Gàidhlig", "Wrong native_name"
def test_list_like_language_names():
    """Looking up either alias returns the combined 'Scottish Gaelic; Gaelic' entry."""
    def check(name):
        lang_obj = languages.getlang_by_name(name)
        assert lang_obj is not None, 'Scottish Gaelic; Gaelic not found'
        assert lang_obj.code == "gd", 'Wrong internal repr. code'
        assert lang_obj.name == "Scottish Gaelic; Gaelic", 'Wrong name'
        assert lang_obj.native_name == "Gàidhlig", 'Wrong native_name'

    check('Scottish Gaelic')
    check('Gaelic')
Esempio n. 8
0
def test_language_names_with_modifier_in_bracket():
    """Names with a bracketed qualifier still resolve via the bare name."""
    # The bracketed qualifier is ignored when matching the plain name.
    swahili = languages.getlang_by_name('Swahili (macrolanguage)')
    assert swahili is not None, 'Swahili not found'
    assert swahili.code == "sw", 'Wrong internal repr. code'
    assert swahili.name == "Swahili", 'Wrong name'
    assert swahili.native_name == "Kiswahili", 'Wrong native_name'

    sanskrit = languages.getlang_by_name('Sanskrit (Saṁskṛta)')
    assert sanskrit is not None, 'Sanskrit not found'
    assert sanskrit.code == "sa", 'Wrong internal repr. code'
    assert sanskrit.name == "Sanskrit (Saṁskṛta)", 'Wrong name'
    assert sanskrit.native_name == "संस्कृतम्", 'Wrong native_name'
def test_language_names_with_modifier_in_bracket():
    """Bracketed modifiers in language names do not break lookups."""
    # Match is attempted on the portion of the name before the brackets.
    result = languages.getlang_by_name("Swahili (macrolanguage)")
    assert result is not None, "Swahili not found"
    assert result.code == "sw", "Wrong internal repr. code"
    assert result.name == "Swahili", "Wrong name"
    assert result.native_name == "Kiswahili", "Wrong native_name"

    result = languages.getlang_by_name("Sanskrit (Saṁskṛta)")
    assert result is not None, "Sanskrit not found"
    assert result.code == "sa", "Wrong internal repr. code"
    assert result.name == "Sanskrit (Saṁskṛta)", "Wrong name"
    assert result.native_name == "संस्कृतम्", "Wrong native_name"
Esempio n. 10
0
def test_language_names_with_modifier_in_bracket():
    """Bracketed modifiers are stripped before name matching."""
    lookup = languages.getlang_by_name

    result = lookup("Swahili (macrolanguage)")
    assert result is not None, "Swahili not found"
    # Not deterministic which Swahili entry will be returned.
    assert result.code in ("sw", "swa"), "Wrong internal repr. code"
    assert result.name == "Swahili", "Wrong name"
    assert result.native_name == "Kiswahili", "Wrong native_name"

    result = lookup("Sanskrit (Saṁskṛta)")
    assert result is not None, "Sanskrit not found"
    assert result.code == "sa", "Wrong internal repr. code"
    assert result.name == "Sanskrit (Saṁskṛta)", "Wrong name"
    assert result.native_name == "संस्कृतम्", "Wrong native_name"
    def pre_run(self, args, options):
        """
        Build the ricecooker JSON tree for the channel and write it to disk.

        Two modes: if ``youtube_channel_id`` is present in options the channel
        is built from a YouTube playlist; otherwise the Khan Academy topic
        tree is downloaded and converted to ricecooker nodes.
        """
        if "lang" in options:
            language_code = options["lang"]
        else:
            language_code = (
                "en"
            )  # default to en if no language specified on command line

        # Accept either a language code or a full language name on the CLI.
        lang = getlang(language_code) or getlang_by_name(language_code)

        channel_node = dict(
            source_id="KA ({0})".format(language_code),
            source_domain="khanacademy.org",
            title="Khan Academy ({0})".format(lang.native_name),
            description=CHANNEL_DESCRIPTION_LOOKUP.get(
                language_code, "Khan Academy content for {}.".format(lang.name)
            ),
            thumbnail=os.path.join("chefdata", "khan-academy-logo.png"),
            language=lang.code,
            children=[],
        )
        # build studio channel out of youtube playlist
        if options.get("youtube_channel_id"):
            youtube_id = options.get("youtube_channel_id")
            logger.info(
                "Downloading youtube playlist {} for {} language".format(
                    youtube_id, lang.name
                )
            )
            root_node = youtube_playlist_scraper(youtube_id, channel_node)
            # write to json file
            logger.info("writing ricecooker json to a file")
            json_tree_path = self.get_json_tree_path(*args, **options)
            write_tree_to_json_tree(json_tree_path, root_node)
            # Playlist mode is complete; skip the KA API path entirely.
            return

        logger.info("downloading KA tree")
        # build channel through KA API
        ka_root_topic = get_khan_topic_tree(lang=language_code)

        if options.get("english_subtitles"):
            # we will include english videos with target language subtitles
            duplicate_videos(ka_root_topic)

        # Rebuild the full language code (primary code plus optional subcode,
        # e.g. "pt-BR") to pass as the conversion target language.
        language_code = lang.primary_code
        if lang.subcode:
            language_code = language_code + "-" + lang.subcode

        logger.info("converting KA nodes to ricecooker json nodes")
        root_topic = convert_ka_node_to_ricecooker_node(
            ka_root_topic, target_lang=language_code
        )

        for topic in root_topic["children"]:
            channel_node["children"].append(topic)

        # write to json file
        logger.info("writing ricecooker json to a file")
        json_tree_path = self.get_json_tree_path(*args, **options)
        write_tree_to_json_tree(json_tree_path, channel_node)
Esempio n. 12
0
    def test_invalid_format(self):
        """A .txt file is rejected with InvalidSubtitleFormatError."""
        language = languages.getlang_by_name('English')
        txt_path = os.path.join(test_files_dir, 'not.txt')
        converter = build_subtitle_converter_from_file(txt_path)

        with self.assertRaises(InvalidSubtitleFormatError):
            converter.convert(language.code)
Esempio n. 13
0
    def test_invalid_format__empty(self):
        """An empty caption file raises InvalidSubtitleFormatError."""
        language = languages.getlang_by_name('English')
        empty_path = os.path.join(test_files_dir, 'empty.ttml')
        converter = build_subtitle_converter_from_file(empty_path)

        with self.assertRaises(InvalidSubtitleFormatError, msg='Caption file is empty'):
            converter.convert(language.code)
Esempio n. 14
0
def fetch_all_languages(channel):
    """Scrape the MEET training site and add one language node per course link."""
    doc = get_parsed_html_from_url('http://migranthealth.eu/etraining/')
    prefix_len = len('MEET ')
    for anchor in doc.select('.category.essentialcats a'):
        course_url = anchor['href']
        # Link text looks like "MEET <Language>"; strip the prefix.
        language_name = anchor.text.strip()[prefix_len:]
        lang_obj = languages.getlang_by_name(language_name)
        channel.add_child(fetch_language(course_url, lang_obj))
 def __get_language_code(self, language_str):
     """Map a language name (English or native) to a le_utils language code.

     Falls back to the chef's English code when the name is unrecognized.
     """
     language = getlang_by_name(language_str) or getlang_by_native_name(
         language_str)
     if language:
         return language.code
     else:
         print('Unknown language:', language_str)
         return NalibaliChef.ENGLISH_LANGUAGE_CODE
Esempio n. 16
0
    def test_invalid_language(self):
        """Conversion fails when the requested language is absent from the captions."""
        spanish = languages.getlang_by_name('Spanish')
        sami_path = os.path.join(test_files_dir, 'encapsulated.sami')
        converter = build_subtitle_converter_from_file(sami_path)

        with self.assertRaises(InvalidSubtitleLanguageError):
            converter.convert(spanish.code)
Esempio n. 17
0
    def test_not_expected_type(self):
        """Declaring the wrong in_format makes conversion fail."""
        language = languages.getlang_by_name('Arabic')
        srt_path = os.path.join(test_files_dir, 'basic.srt')
        # Claim the SRT fixture is SCC so the parser rejects it.
        converter = build_subtitle_converter_from_file(
            srt_path, in_format=file_formats.SCC)

        with self.assertRaises(InvalidSubtitleFormatError):
            converter.convert(language.code)
Esempio n. 18
0
    def test_srt_conversion(self):
        """SRT input converts to the expected VTT output byte-for-byte."""
        arabic = languages.getlang_by_name('Arabic')
        expected_file = os.path.join(test_files_dir, 'basic.vtt')

        converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'basic.srt'))
        converter.replace_unknown_language(arabic.code)

        with tempfile.NamedTemporaryFile() as actual_file:
            converter.write(actual_file.name, arabic.code)
            self.assertFileHashesEqual(expected_file, actual_file.name)
Esempio n. 19
0
    def test_replace_unknown_language(self):
        """replace_unknown_language() swaps the unknown marker for a real code."""
        arabic = languages.getlang_by_name('Arabic')
        converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'basic.srt'))

        # The fixture starts out tagged with the unknown-language marker.
        self.assertTrue(converter.has_language(LANGUAGE_CODE_UNKNOWN))
        converter.replace_unknown_language(arabic.code)

        # After replacement only the real language code remains.
        self.assertTrue(converter.has_language(arabic.code))
        self.assertFalse(converter.has_language(LANGUAGE_CODE_UNKNOWN))
Esempio n. 20
0
    def test_valid_language(self):
        """A SAMI file containing the requested language converts to the expected VTT."""
        english = languages.getlang_by_name('English')
        expected_file = os.path.join(test_files_dir, 'encapsulated.vtt')

        converter = build_subtitle_converter_from_file(
            os.path.join(test_files_dir, 'encapsulated.sami'))
        self.assertTrue(converter.has_language(english.code))

        with tempfile.NamedTemporaryFile() as actual_file:
            converter.write(actual_file.name, english.code)
            self.assertFileHashesEqual(expected_file, actual_file.name)
def getlang_by_language_en(language_en):
    """
    Convert language names used on PraDigi websites to le_utils language object.
    """
    # Misspellings seen on the site, mapped to canonical English names.
    corrections = {
        'Odiya': 'Oriya',
        'Odisa': 'Oriya',
        'Bangali': 'Bengali',
        'Telagu': 'Telugu',
    }
    normalized = corrections.get(language_en, language_en)
    return getlang_by_name(normalized)
    def get_lang_obj(self):
        """
        Resolve ``self.name`` to a language and store its attributes via
        ``set_value``.

        Tries ``getlang_by_name`` first, then falls back to the ``UND_LANG``
        lookup table. Returns True when a language was found and stored,
        False otherwise.
        """
        if self.name != "":
            lang_name = self.name
            language_obj = getlang_by_name(lang_name)

            if language_obj:
                self.set_value(language_obj.name, language_obj.code, language_obj.native_name)
                return True
            # BUG FIX: the original indexed UND_LANG[self.name] directly, which
            # raised KeyError for names missing from both lookups; test
            # membership first so unknown names fall through to False.
            if self.name in UND_LANG and UND_LANG[self.name]:
                self.set_value(UND_LANG[self.name]["name"], UND_LANG[self.name]["code"], UND_LANG[self.name]["native_name"])
                return True
        return False
    def get_json_tree_path(self, *args, **kwargs):
        """
        Return path to ricecooker json tree file. Override this method to use
        a custom filename, e.g., for channel with multiple languages.

        Raises ValueError when the ``lang`` option resolves to no known
        language (previously this surfaced as a cryptic AttributeError).
        """
        # Channel language; default to en if not specified on command line.
        language_code = kwargs.get("lang", "en")

        # Accept either a language code or a full language name.
        lang_obj = getlang(language_code) or getlang_by_name(language_code)
        if lang_obj is None:
            raise ValueError("Unknown language: {}".format(language_code))

        json_filename = self.RICECOOKER_JSON_TREE_TPL.format(lang_obj.code)
        json_tree_path = os.path.join(self.TREES_DATA_DIR, json_filename)
        return json_tree_path
def add_node_document(booklist, level_topic, as_booklist):
    """
    Add books as DocumentNode under a specific level of reading.
    Parameters:
    * booklist - The list of books to be added as DocumentNodes
    * level_topic - The TopicNode regarding current level that the DocumentNodes
                    will be attached to
    * as_booklist - The list of books from African Storybooks
    """
    for item in booklist:
        # Initialize the source domain and content_id
        domain = uuid.uuid5(uuid.NAMESPACE_DNS, "storyweaver.org.in")
        book_id = str(item["source_id"])

        # If the publisher is AS and the book is found,
        # then change the source_domain and content_id
        if item["publisher"] == "African Storybook Initiative":
            check = check_if_story_in_AS(as_booklist, item["title"])
            if check[0]:
                domain = uuid.uuid5(uuid.NAMESPACE_DNS,
                                    "www.africanstorybook.org")
                book_id = check[1]

        # Given that StoryWeaver provides the link to a zip file,
        # we will download the zip file and extract the pdf file from it
        with tempfile.NamedTemporaryFile(suffix=".zip") as tempf:
            try:
                resp = downloader.make_request(item["link"],
                                               clear_cookies=False)
                resp.raise_for_status()
                tempf.write(resp.content)
                # BUG FIX: flush buffered bytes to disk before ZipFile
                # re-opens the file by name below; otherwise the archive
                # can be read back incomplete.
                tempf.flush()
            except Exception as e:
                # Do not create the node if download fails
                LOGGER.info("Error: {} when downloading {}".format(
                    e, item["link"]))
                continue

            filename = ""
            with zipfile.ZipFile(tempf.name, "r") as f:
                for zipped_file in f.namelist():
                    if os.path.splitext(zipped_file)[1][1:] == "pdf":
                        tempdir = os.path.dirname(tempf.name)
                        f.extract(zipped_file, path=tempdir)
                        filename = os.path.join(tempdir, zipped_file)
                        break

        # If no pdf file has been found in the zip, do not create the node
        if not filename:
            continue

        # Create the document node with given information
        document_file = DocumentFile(path=filename)
        language_obj = getlang_by_name(item["language"])
        book = DocumentNode(
            title=item["title"],
            source_id=book_id,
            author=item["author"],
            provider=item["publisher"],
            files=[document_file],
            license=get_license(licenses.CC_BY,
                                copyright_holder="StoryWeaver"),
            thumbnail=item.get("thumbnail"),
            description=item["description"],
            domain_ns=domain,
            language=language_obj,
        )
        level_topic.add_child(book)
def build_lang_lookup_table(FEED_ROOT_URL):
    """
    Extracts all the root URLs of the languages, based on the links
    with face `Languages` in FEED_ROOT_URL.

    Returns a dict mapping le_utils language codes to dicts with the keys
    ``alpha_3``, ``lang_title``, ``href``, ``name``, and ``native_name``.
    Raises AssertionError if any GDL language is not supported in Kolibri.
    """
    OPDS_LANG_ROOTS = {}

    # Check for languages we don't yet support in Kolibri.
    langs_not_found = []

    feed = feedparser.parse(FEED_ROOT_URL)
    lang_links = []
    for link in feed.feed.links:
        if 'opds:facetgroup' in link:
            fg = link['opds:facetgroup']
            if fg == 'Languages':
                lang_links.append(link)

    # Build lookup table    lang_code --> dict with info about content in that language
    # where lang_code is the Learning Equality internal language codes defined in le_utils
    # Assume the chef script will be run on the command line using   lang=lang_code
    # E.g. lang_code for Zulu is `zul`, for Amharic it's `am`, and for Nepali it's `ne-NP`
    for link in lang_links:
        href = link['href']
        m = _LANG_CODE_RE.search(href)
        if not m:
            raise ValueError('Cannot find language code in href: ' + str(href))
        gdl_lang_code = m.groupdict()['gdl_lang_code']
        lang_title = link['title']
        # Normalize a couple of GDL titles to the names le_utils knows.
        if lang_title == "isiNdebele seSewula":
            lang_title = "isiNdebele"
        elif lang_title == 'বাঙালি':
            lang_title = 'বাংলা'

        # BUG FIX: corrected typo in progress message ('Processig').
        print('Processing lang_title', lang_title)
        #
        # ATTEMPT 1: lookup by English name ##############
        lang_obj = getlang_by_name(lang_title)
        if not lang_obj:
            lang_obj = getlang_by_native_name(lang_title)
            #
            # ATTEMPT 2: lookup via pycountry by GDL code #########
            if not lang_obj:
                pyc_lang = pycountry.languages.lookup(gdl_lang_code)
                code = pyc_lang.alpha_3
                if hasattr(pyc_lang, 'alpha_2'):
                    #
                    # ATTEMPT 3: prefer the two-letter code when it exists ##############
                    code = pyc_lang.alpha_2

                # getlang_by_alpha2 is a misnomer, codes can be alpha2, alpha3, or lang+locale.
                lang_obj = getlang_by_alpha2(code)
                if not lang_obj:
                    langs_not_found.append((pyc_lang, lang_title))
                    print('ERROR could not find Kolibri lang info for ',
                          pyc_lang)
                    continue
        lang_code = lang_obj.code
        OPDS_LANG_ROOTS[lang_code] = dict(
            alpha_3=gdl_lang_code,
            lang_title=lang_title,
            href=href,
            name=lang_obj.name,
            native_name=lang_obj.native_name,
        )

    # For now, make missing languages a hard error so we can evaluate new language support case-by-case.
    if len(langs_not_found) > 0:
        lang_codes = []
        for pyc_lang, lang_title in langs_not_found:
            lang_codes.append(pyc_lang.alpha_3)
        message = "The following languages are not yet supported in Kolibri: {}".format(
            ",".join(lang_codes))
        assert len(langs_not_found) == 0, message

    return OPDS_LANG_ROOTS
Esempio n. 26
0
def test_unknown_name():
    """An unrecognized language name yields None rather than raising."""
    result = languages.getlang_by_name('UnknoenLanguage')
    assert result is None, 'UnknoenLanguage name returned non-None'
class NalibaliChef(JsonTreeChef):
    """Chef that crawls the Nal'ibali story library into a ricecooker JSON tree."""

    #region Constants
    HOSTNAME = 'nalibali.org'
    ROOT_URL = f'http://{HOSTNAME}/story-library'
    DATA_DIR = 'chefdata'
    TREES_DATA_DIR = os.path.join(DATA_DIR, 'trees')
    CRAWLING_STAGE_OUTPUT = 'web_resource_tree.json'
    SCRAPING_STAGE_OUTPUT = 'ricecooker_json_tree.json'
    ZIP_FILES_TMP_DIR = os.path.join(DATA_DIR, 'zipfiles')
    LICENSE = get_license(licenses.CC_BY_NC_ND,
                          copyright_holder="Nal'ibali").as_dict()
    # Fallback code used when a story's language cannot be identified.
    ENGLISH_LANGUAGE_CODE = getlang_by_name('English').code
    #endregion Constants

    #region Regexes
    # Matches paginated story-list links and captures the page number.
    STORY_PAGE_LINK_RE = compile(r'^.+page=(?P<page>\d+)$')
    SUPPORTED_THUMBNAIL_EXTENSIONS = compile(r'\.(png|jpg|jpeg)')
    AUTHOR_RE = compile(r'author:', IgnoreCase)
    AUDIO_STORIES_RE = compile(r'Audio Stories', IgnoreCase)
    AUDIO_STORY_ANCHOR_RE = compile(r'story-library/audio-stories')
    IONO_FM_RE = compile(r'iono.fm')
    RSS_FEED_RE = compile(r'/rss/chan')

    #endregion Regexes

    def __init__(self, html, logger):
        # html: page-fetching helper exposing .get(url); logger: shared logger.
        # The parent is initialized with (None, None) — this chef does not pass
        # args/options at construction time.
        super(NalibaliChef, self).__init__(None, None)
        self._html = html
        self._logger = logger

    #region Helper functions
    def __absolute_url(self, url):
        """Normalize scheme-relative and site-relative URLs to absolute ones."""
        if url.startswith("//"):
            return "https:" + url
        if url.startswith("/"):
            return f'http://{NalibaliChef.HOSTNAME}{url}'
        return url

    def __get_text(self, elem):
        """Return the element's text with CRs removed and newlines collapsed ('' for None)."""
        if elem is None:
            return ""
        return elem.get_text().replace('\r', '').replace('\n', ' ').strip()

    def __sanitize_author(self, text):
        """Strip the 'author:' label from an author string; falsy input passes through."""
        if not text:
            return text
        cleaned, _ = NalibaliChef.AUTHOR_RE.subn('', text)
        return cleaned.strip()

    def __process_language(self, language):
        """Map site-specific language labels to canonical language names."""
        replacements = {
            'sotho': 'Sesotho',
            'ndebele': 'North Ndebele',
            'tsivenda': 'Tshivenda',
            'seswati': 'Siswati',
            'tsw': 'Setswana',
            'continue reading': 'English',
        }
        # Unrecognized labels are returned unchanged (original casing kept).
        return replacements.get(language.lower(), language)

    def __get_language_code(self, language_str):
        """Map a language name (English or native) to a le_utils language code.

        Falls back to the chef's English code when the name is unrecognized.
        """
        language = getlang_by_name(language_str) or getlang_by_native_name(
            language_str)
        if language:
            return language.code
        else:
            print('Unknown language:', language_str)
            return NalibaliChef.ENGLISH_LANGUAGE_CODE

    #endregion Helper functions

    #region Crawling
    def crawl(self, args, options):
        """
        Crawl the story library and persist the web resource tree as JSON.

        Returns the list of crawled story hierarchies.
        """
        root_page = self._html.get(NalibaliChef.ROOT_URL)
        story_hierarchies = self._crawl_story_hierarchies(root_page)
        web_resource_tree = dict(
            kind='NalibaliWebResourceTree',
            title="Nal'ibali Web Resource Tree",
            language='en',
            children=story_hierarchies,
        )
        json_file_name = os.path.join(NalibaliChef.TREES_DATA_DIR,
                                      NalibaliChef.CRAWLING_STAGE_OUTPUT)
        with open(json_file_name, 'w') as json_file:
            json.dump(web_resource_tree, json_file, indent=2)
            self._logger.info('Crawling results stored in ' + json_file_name)
        return story_hierarchies

    def _crawl_story_hierarchies(self, page):
        """Build the top-level story hierarchies with their children attached."""
        content_div = page.find('div', class_='region-content')
        vocabulary_div = content_div.find('div', class_='view-vocabulary')
        stories_divs = vocabulary_div.find_all('div', 'views-row')
        story_hierarchies = list(map(self._crawl_to_story_hierarchy, stories_divs))
        # Map each hierarchy URL to its {language: [stories]} children.
        stories_dict = dict(map(self._crawl_story_hierarchy,
                                story_hierarchies))
        for hierarchy in story_hierarchies:
            hierarchy['children'] = stories_dict.get(hierarchy['url'], {})
        return story_hierarchies

    def _crawl_to_story_hierarchy(self, div):
        """Convert one vocabulary row <div> into a NalibaliHierarchy dict."""
        anchor = div.find('div',
                          class_='views-field').find('a', class_='btn link')
        return dict(
            kind='NalibaliHierarchy',
            title=self.__get_text(div.find('h2')),
            thumbnail=div.find('img', class_='img-responsive')['src'],
            description=self.__get_text(div.find('div', class_='body')),
            url=self.__absolute_url(anchor['href']),
        )

    def _crawl_story_hierarchy(self, hierarchy):
        """
        Crawl every pagination page of one hierarchy and return
        ``(stories_url, stories_by_language)``, where stories_by_language maps
        a language name to a de-duplicated, ordered list of story dicts.
        """
        # Audio stories use a different page structure; handle them separately.
        if NalibaliChef.AUDIO_STORIES_RE.search(hierarchy['title']):
            return self._crawl_audio_stories_hierarchy(hierarchy)

        stories_url = hierarchy['url']
        paginations = self._crawl_pagination(stories_url)
        # The landing page itself is page 0 and is not in the pagination list.
        paginations.insert(
            0,
            dict(
                kind='NalibaliPagination',
                url=stories_url,
                page=0,
                name='1',
            ))
        all_stories_by_bucket = list(
            map(self._crawl_pagination_stories, paginations))
        stories_by_language = {}
        for stories_bucket in all_stories_by_bucket:
            for story in stories_bucket:
                # NOTE(review): the loop variable below shadows the outer
                # ``story``; each localized variant replaces it in turn.
                for lang, story in story['supported_languages'].items():
                    # Per language keep a (seen-URL set, ordered story list).
                    by_language = stories_by_language.get(lang)
                    if not by_language:
                        by_language = (set(), [])
                        stories_by_language[lang] = by_language
                    uniques, stories = by_language
                    url = story['url']
                    if url not in uniques:
                        stories.append(story)
                    uniques.add(url)
        # Keep only the ordered story lists; drop the helper sets.
        for lang, (uniques, stories) in stories_by_language.items():
            stories_by_language[lang] = stories
        return stories_url, stories_by_language

    def _crawl_pagination(self, url):
        """
        Return the numbered pagination descriptors reachable from ``url``,
        recursing via the last visible page until the final page is covered.
        """
        page = self._html.get(url)
        pagination_ul = page.find('ul', class_='pagination')

        # No pagination widget means a single page of results.
        if not pagination_ul:
            return []

        anchors = pagination_ul.find_all(
            'a', attrs={'href': NalibaliChef.STORY_PAGE_LINK_RE})
        paginations = list(map(self._crawl_to_pagination, anchors))
        paginations_dict = {p['name']: p for p in paginations}
        # Keep only numbered page links; drop first/previous/next/last arrows.
        actual_paginations = [
            p for p in paginations
            if ('next' not in p['name'] and 'last' not in p['name'] and 'first'
                not in p['name'] and 'previous' not in p['name'] and '>' not in
                p['name'] and '‹' not in p['name'] and p['name'] != '')
        ]
        last = paginations_dict.get('last')

        # This is to handle Story Cards hierarchy, since it does not have a <<last>> pagination item
        if not last:
            return actual_paginations

        current_last = actual_paginations[-1]
        if current_last['page'] == last['page']:
            return actual_paginations
        else:
            # More pages exist beyond the visible window: recurse from the last
            # visible page and de-duplicate by page number while preserving
            # order (``seen.add`` returns None, so ``not seen.add(...)`` is True).
            seen = set()
            return [
                x for x in actual_paginations +
                self._crawl_pagination(current_last['url'])
                if x['page'] not in seen and not seen.add(x['page'])
            ]

    def _crawl_to_pagination(self, anchor):
        """Convert one pagination <a> element into a NalibaliPagination dict."""
        href = anchor['href']
        match = NalibaliChef.STORY_PAGE_LINK_RE.match(href)
        if not match:
            raise Exception('STORY_PAGE_LINK_RE could not match')
        text = self.__get_text(anchor)
        parts = text.split()
        return dict(
            kind='NalibaliPagination',
            url=self.__absolute_url(href),
            page=match.groupdict()['page'],
            name=parts[0] if parts else text,
        )

    def _crawl_pagination_stories(self, pagination):
        """Collect every parseable story found on one pagination page."""
        page = self._html.get(pagination['url'])
        stories = []
        for content in page.find_all('div', class_='view-content'):
            rows = content.find_all('div', class_='views-row')
            # Drop rows that _crawl_to_story could not parse (it returns None).
            stories.extend(
                story for story in map(self._crawl_to_story, rows) if story)
        return stories

    def _crawl_to_story(self, div):
        """Parse a story card <div> into a 'NalibaliStory' dict whose
        supported_languages maps language name -> localized story dict.

        Returns None when no usable title can be extracted.
        """
        # Prefer the Dublin Core title span; otherwise fall back to the
        # <h3> inside the card's content block.
        dc_title = div.find('span', property='dc:title')
        if dc_title:
            title = dc_title['content']
        else:
            content_div = div.find('div', class_='content')
            if not content_div:
                return None
            title = self.__get_text(content_div.find('h3'))

        if not title:
            return None

        posted_date = self.__get_text(div.find('div', class_='field-date'))
        author = self.__sanitize_author(
            self.__get_text(div.find('div', class_='field-author')))
        description = self.__get_text(div.find('div', class_='field-body'))

        links = div.find('div', class_='links')
        anchors = links.find_all('a') if links else []

        image = div.find('img', class_='img-responsive') or div.find('img')
        image_src = image['src'] if image else ''
        # Use the image only when its extension is a supported thumbnail
        # format; strip any query string from the URL.
        if NalibaliChef.SUPPORTED_THUMBNAIL_EXTENSIONS.search(image_src):
            thumbnail = image_src.split('?')[0]
        else:
            thumbnail = None

        # One localized story entry per language link on the card.
        story_by_language = {}
        for anchor in anchors:
            language = self.__process_language(self.__get_text(anchor))
            story_by_language[language] = dict(
                kind='NalibaliLocalizedStory',
                title=title,
                description=description,
                posted_date=posted_date,
                author=author,
                language=language,
                url=self.__absolute_url(anchor['href']),
                thumbnail=thumbnail,
            )

        return dict(
            kind='NalibaliStory',
            title=title,
            posted_date=posted_date,
            author=author,
            supported_languages=story_by_language,
        )

    def _crawl_audio_stories_hierarchy(self, hierarchy):
        """Crawl the Audio stories hierarchy: discover each language's
        iono.fm RSS feed and build an audio story dict per feed item.

        Returns a tuple (stories_url, stories_by_language) where
        stories_by_language maps a language name to a list of story dicts.

        Raises:
            Exception: when a feed item has no .mp3 rendition available.
        """
        stories_url = hierarchy['url']
        page = self._html.get(stories_url)
        content = page.find('section',
                            id='section-main').find('div',
                                                    class_='region-content')
        # Language links are class-less anchors; texts of 2 chars or less
        # are assumed to be navigation symbols and skipped.
        language_info = [
            (self.__process_language(self.__get_text(anchor)), anchor['href'])
            for anchor in content.find_all(
                'a', attrs={'href': NalibaliChef.AUDIO_STORY_ANCHOR_RE})
            if not anchor.get('class') and len(self.__get_text(anchor)) > 2
        ]
        stories_by_language = {}

        for lang, url in language_info:
            # Follow language page -> iono.fm channel -> RSS feed.
            language_page = self._html.get(self.__absolute_url(url))
            language_iono_fm_url = language_page.find(
                'a', attrs={'href': NalibaliChef.IONO_FM_RE})['href']
            language_iono_fm_page = self._html.get(language_iono_fm_url)
            rss_url = language_iono_fm_page.find(
                'link', attrs={'href': NalibaliChef.RSS_FEED_RE})['href']
            rss_page = self._html.get_xml(rss_url)
            items = rss_page.find_all('item')
            stories = []

            for item in items:
                # Strip the query string, then force the .mp3 rendition of
                # the enclosure (iono.fm serves several extensions).
                # NOTE: renamed from `url` to avoid shadowing the loop var.
                enclosure_url = item.enclosure['url'].split('?')[0]
                filename = os.path.basename(enclosure_url)
                filename_no_extension = PurePosixPath(filename).stem
                mp3_url = os.path.join(os.path.dirname(enclosure_url),
                                       filename_no_extension) + '.mp3'
                # The original `mp3_url if exists else url` fallback was
                # unreachable after this raise, so it has been removed.
                if self._html.head(mp3_url).status_code != 200:
                    raise Exception(f'No mp3 version available for {enclosure_url}')
                parsed_url = urlparse(mp3_url)

                stories.append(dict(
                    title=self.__get_text(item.title),
                    source_id=parsed_url.path,
                    url=mp3_url,
                    content_type=item.enclosure['type'],
                    description=self.__get_text(item.summary),
                    pub_date=self.__get_text(item.pubDate),
                    author=self.__get_text(item.author),
                    language=lang,
                    thumbnail=item.thumbnail['href'],
                ))
            stories_by_language[lang] = stories
        return stories_url, stories_by_language

    #endregion Crawling

    #region Scraping
    def scrape(self, args, options):
        """Scraping stage entry point.

        Reads the crawling-stage web resource tree from disk, scrapes each
        known hierarchy into ricecooker topic nodes, writes the resulting
        channel tree to the scraping-stage JSON file, and returns it.
        """
        kwargs = {}  # combined dictionary of argparse args and extra options
        kwargs.update(args)
        kwargs.update(options)

        with open(
                os.path.join(NalibaliChef.TREES_DATA_DIR,
                             NalibaliChef.CRAWLING_STAGE_OUTPUT),
                'r') as json_file:
            web_resource_tree = json.load(json_file)
            assert web_resource_tree['kind'] == 'NalibaliWebResourceTree'

        ricecooker_json_tree = dict(
            source_domain=NalibaliChef.HOSTNAME,
            source_id="nal'ibali",
            title=web_resource_tree['title'],
            description=
            """Nal'ibali (isiXhosa for "here's the story") is a national reading-for-enjoyment campaign to spark children's potential through storytelling and reading.""",
            language='en',
            thumbnail=
            'http://nalibali.org/sites/default/files/nalibali_logo.png',
            children=[],
        )
        hierarchies_map = {
            h['title']: h
            for h in web_resource_tree['children']
        }
        # Pair each hierarchy title with its dedicated story scraper; this
        # replaces the previous hard-coded child indices, which assumed
        # exactly five hierarchies in the crawled tree.
        scrapers = [
            ('Multilingual stories', self._scrape_multilingual_story),
            ('Audio stories', self._scrape_audio_story),
            ('Story cards', self._scrape_story_card),
            ('Story seeds', self._scrape_story_seed),
            ('Your stories', self._scrape_your_story),
        ]
        ricecooker_json_tree['children'] = [
            self._scrape_hierarchy(hierarchies_map.get(title), scraper)
            for title, scraper in scrapers
        ]
        write_tree_to_json_tree(
            os.path.join(NalibaliChef.TREES_DATA_DIR,
                         NalibaliChef.SCRAPING_STAGE_OUTPUT),
            ricecooker_json_tree)
        return ricecooker_json_tree

    def _scrape_hierarchy(self, hierarchy, story_scraping_func):
        """Turn one crawled hierarchy into a TOPIC node with one child
        topic per language, each holding the nodes produced by
        story_scraping_func (falsy results are dropped)."""
        assert hierarchy['kind'] == 'NalibaliHierarchy'
        hierarchy_title = hierarchy['title']
        hierarchy_name = hierarchy_title.replace(' ', '_')
        hierarchy_by_language = []
        for language, stories in hierarchy.get('children', {}).items():
            stories_nodes = []
            for story in stories:
                node = story_scraping_func(story)
                if node:
                    stories_nodes.append(node)
            hierarchy_by_language.append(dict(
                kind=content_kinds.TOPIC,
                source_id=f'{hierarchy_name}_{language}',
                title=language,
                description=f'Stories in {language}',
                children=stories_nodes,
            ))
        return dict(
            kind=content_kinds.TOPIC,
            source_id=hierarchy_title,
            title=hierarchy_title,
            description=hierarchy['description'],
            children=hierarchy_by_language,
            thumbnail=hierarchy['thumbnail'],
        )

    def _scrape_multilingual_story(self, story):
        """Scrape a multilingual story as an HTML5 app node (delegates to
        _scrape_story_html5)."""
        return self._scrape_story_html5(story)

    def _scrape_audio_story(self, story):
        """Build an AUDIO ricecooker node dict from a crawled audio story."""
        language_code = self.__get_language_code(story['language'])
        audio_file = dict(
            file_type=content_kinds.AUDIO,
            path=story['url'],
            language=language_code,
        )
        return dict(
            kind=content_kinds.AUDIO,
            source_id=story['source_id'],
            title=story['title'],
            license=NalibaliChef.LICENSE,
            author=story['author'],
            description=story['description'],
            domain_ns=NalibaliChef.HOSTNAME,
            thumbnail=story['thumbnail'],
            files=[audio_file],
        )

    def _scrape_story_card(self, story):
        """Build a DOCUMENT ricecooker node for a PDF story card.

        Raises for any card whose URL is not a PDF (not implemented).
        """
        url = story['url']
        lang_code = self.__get_language_code(story['language'])

        # Guard clause: only PDF cards are supported.
        if not (url and url.endswith('.pdf')):
            raise Exception('Non-PDF version not implemented')

        parsed_url = urlparse(url)
        return dict(
            source_id=parsed_url.path,
            kind=content_kinds.DOCUMENT,
            title=story['title'],
            description=story['description'],
            license=NalibaliChef.LICENSE,
            author=story['author'],
            thumbnail=story['thumbnail'],
            language=lang_code,
            files=[dict(
                file_type=content_kinds.DOCUMENT,
                path=url,
            )],
        )

    def _scrape_story_seed(self, story):
        """Scrape a story seed as an HTML5 app node (delegates to
        _scrape_story_html5)."""
        return self._scrape_story_html5(story)

    def _scrape_your_story(self, story):
        """Scrape a user-submitted story as an HTML5 app node (delegates to
        _scrape_story_html5)."""
        return self._scrape_story_html5(story)

    def _scrape_download_image(self, base_path, img):
        """Download the image referenced by an <img> tag into base_path and
        rewrite the tag's src to the local relative path.

        Does nothing when the tag has an empty src.
        """
        url = img['src']

        if not url:
            return

        # 'https' also begins with 'http', so a single prefix test covers
        # both schemes (the original second startswith was redundant).
        if url.startswith('http'):
            absolute_url = url
            relative_url = urlparse(url).path
        else:
            absolute_url = self.__absolute_url(url)
            relative_url = url

        self._scrape_download_image_helper(base_path, img, absolute_url,
                                           relative_url)

    def _scrape_download_image_helper(self, base_path, img, absolute_url,
                                      relative_url):
        """Fetch absolute_url, save it under base_path mirroring
        relative_url's directory layout, and point img's src at the saved
        relative location. Non-200 responses are silently skipped."""
        image_response = self._html.get_image(absolute_url)
        if image_response.status_code != 200:
            return
        subdirs = os.path.dirname(relative_url).split('/')
        image_dir = os.path.join(base_path, *subdirs)
        pathlib.Path(image_dir).mkdir(parents=True, exist_ok=True)
        image_path = os.path.join(image_dir, os.path.basename(relative_url))
        with open(image_path, 'wb') as out_file:
            # Stream the (possibly compressed) body straight to disk.
            image_response.raw.decode_content = True
            shutil.copyfileobj(image_response.raw, out_file)
        # Rewrite the tag to the saved path, dropping a single leading '/'.
        if relative_url[0] == '/':
            img['src'] = relative_url[1:]
        else:
            img['src'] = relative_url

    def _scrape_story_html5(self, story):
        """Scrape a story page into a self-contained HTML5 app node:
        images are downloaded next to a generated index.html, and the
        whole directory is zipped for the node's file."""
        url = story['url']
        page = self._html.get(url)
        story_section = page.find('section', id='section-main')

        # Is there a way to cross link HTML5AppNode?
        links_section = story_section.find('div', class_='languages-links')
        if links_section:
            links_section.extract()

        title = self.__get_text(story_section.find('h1', class_='page-header'))
        language_code = self.__get_language_code(story['language'])
        dest_path = tempfile.mkdtemp(dir=NalibaliChef.ZIP_FILES_TMP_DIR)

        # Localize every image so the zipped app is self-contained.
        for img in story_section.find_all('img'):
            self._scrape_download_image(dest_path, img)

        skeleton = """
        <!DOCTYPE html>
        <html>
          <head>
            <meta charset="utf-8">
            <title></title>
          </head>
          <body>
          </body>
        </html>"""
        document = BeautifulSoup(skeleton, "html.parser")
        document.find('body').append(story_section)
        index_path = os.path.join(dest_path, 'index.html')
        with open(index_path, 'w', encoding="utf8") as index_html:
            index_html.write(str(document))

        zip_path = create_predictable_zip(dest_path)
        parsed_story_url = urlparse(url)
        html5_file = dict(
            file_type=content_kinds.HTML5,
            path=zip_path,
            language=language_code,
        )
        return dict(
            kind=content_kinds.HTML5,
            source_id=parsed_story_url.path if parsed_story_url else url,
            title=title,
            language=language_code,
            description=story['description'],
            license=NalibaliChef.LICENSE,
            thumbnail=story['thumbnail'],
            files=[html5_file],
        )

    #endregion Scraping

    def pre_run(self, args, options):
        """Run the crawling stage then the scraping stage, so both JSON
        trees exist before the main run."""
        self.crawl(args, options)
        self.scrape(args, options)
Esempio n. 28
0
def test_language_names_with_modifier_after_comma():
    """Names like 'Base, Modifier' should fall back to matching the base
    language name before the comma."""
    result = languages.getlang_by_name('Arabic, Tunisian')
    assert result is not None, 'Arabic fallback not found'
    assert result.code == "ar", 'Wrong internal repr. code'
    assert result.name == "Arabic", 'Wrong name'
Esempio n. 29
0
    def upload_content(self, data, access_token, channel):
        """Build the TicTacLearn topic tree onto `channel` and return it.

        `data` is a nested mapping iterated as: language -> grade ->
        subject -> chapter -> topic -> content. A topic titled
        "Chapter Assessment" becomes an ExerciseNode attached directly to
        the chapter; every other topic becomes a TopicNode holding video
        nodes (fetched from Dropbox via `access_token`) and/or a per-topic
        assessment ExerciseNode. Videos that fail to download are logged
        via self.add_to_failed and skipped.
        """
        for language, language_value in data.items():
            # convert to title to apply title case for node titles
            language = language.title()
            # NOTE(review): `language=` receives the object returned by
            # getlang_by_name, not a language-code string — confirm the
            # node classes accept that here.
            language_node = nodes.TopicNode(title=language,
                                            source_id=language,
                                            author="TicTacLearn",
                                            description='',
                                            thumbnail=TTL_MAIN_LOGO,
                                            language=getlang_by_name(language))
            # One topic node per grade within the language.
            for grade, grade_value in language_value.items():
                grade_node = nodes.TopicNode(
                    title='Grade {}'.format(grade),
                    source_id="{}-{}".format(language, grade),
                    author="TicTacLearn",
                    description='',
                    thumbnail=TTL_MAIN_LOGO,
                    language=getlang_by_name(language))

                for subject, subject_value in grade_value.items():
                    subject = subject.title()
                    subject_node = nodes.TopicNode(
                        title=subject,
                        source_id="{}-{}-{}".format(language, grade, subject),
                        author="TicTacLearn",
                        description='',
                        thumbnail=TTL_MAIN_LOGO,
                        language=getlang_by_name(language))
                    for chapter, chapter_value in subject_value.items():
                        chapter = chapter.title()
                        chapter_node = nodes.TopicNode(
                            title=chapter,
                            source_id="{}-{}-{}-{}".format(
                                language, grade, subject, chapter),
                            author="TicTacLearn",
                            description='',
                            thumbnail=TTL_MAIN_LOGO,
                            language=getlang_by_name(language))
                        for topic, topic_value in chapter_value.items():
                            topic = topic.title()
                            # Chapter-level assessments hang directly off
                            # the chapter node instead of a topic node.
                            if topic == "Chapter Assessment":
                                questions = self.create_question(
                                    topic_value.items())
                                exercise_node = nodes.ExerciseNode(
                                    source_id="{}-{}-{}-{}-{}".format(
                                        language, grade, subject, chapter,
                                        topic),
                                    title=topic,
                                    author="TicTacLearn",
                                    description="Chapter Assessment",
                                    language=getlang_by_name(language),
                                    license=licenses.CC_BYLicense(
                                        "TicTacLearn"),
                                    thumbnail=TTL_MAIN_LOGO,
                                    # Mastery requires every question to be
                                    # answered correctly (m == n).
                                    exercise_data={
                                        "mastery_model": exercises.M_OF_N,
                                        "m": len(questions),
                                        "n": len(questions),
                                        "randomize": True
                                    },
                                    questions=questions)
                                chapter_node.add_child(exercise_node)
                            else:
                                topic_node = nodes.TopicNode(
                                    title=topic,
                                    source_id="{}-{}-{}-{}-{}".format(
                                        language, grade, subject, chapter,
                                        topic),
                                    author="TicTacLearn",
                                    description='',
                                    thumbnail=TTL_MAIN_LOGO,
                                    language=getlang_by_name(language))
                                for content_type, content in topic_value.items(
                                ):
                                    if content_type == "video":
                                        for link, details in content.items():
                                            # Best-effort download: a failed
                                            # video is recorded and skipped,
                                            # never fatal for the run.
                                            try:
                                                video_node = self.video_node_from_dropbox(
                                                    details, link,
                                                    access_token)
                                                topic_node.add_child(
                                                    video_node)
                                            except Exception as e:
                                                print(e)
                                                print(
                                                    "Error getting video from dropbox with link: {}"
                                                    .format(link))
                                                self.add_to_failed(
                                                    link, details,
                                                    content_type)
                                                continue
                                    else:
                                        # content type is assessment
                                        questions = self.create_question(
                                            content.items())
                                        exercise_node = nodes.ExerciseNode(
                                            source_id=
                                            "{}-{}-{}-{}-{}-Assessment".format(
                                                language, grade, subject,
                                                chapter, topic),
                                            title="{} Assessment".format(
                                                topic),
                                            author="TicTacLearn",
                                            description="{} Assessment".format(
                                                topic),
                                            license=licenses.CC_BYLicense(
                                                "TicTacLearn"),
                                            thumbnail=TTL_MAIN_LOGO,
                                            exercise_data={
                                                "mastery_model":
                                                exercises.M_OF_N,
                                                "m": len(questions),
                                                "n": len(questions),
                                                "randomize": True
                                            },
                                            questions=questions)
                                        topic_node.add_child(exercise_node)

                                chapter_node.add_child(topic_node)
                        subject_node.add_child(chapter_node)
                    grade_node.add_child(subject_node)
                language_node.add_child(grade_node)
            channel.add_child(language_node)

        return channel
Esempio n. 30
0
# Map every language tag to a small record {lang_tag, name, le_code},
# starting from the internal language lookup table.
lang_tag_to_lang_dict = {
    le_code: dict(
        lang_tag=le_code,
        name=lang_obj.name.split(';')[0],  # keep only the first listed name
        le_code=le_code,
    )
    for le_code, lang_obj in _LANGLOOKUP.items()
}


# Overlay the pattern-based lookups, resolving each tag to an LE language
# code when possible (first by tag, then by the extracted name).
for item in LANGS_LOOKUP:
    lang_tag = item['lang']
    # Recover the plain name from the compiled pattern's repr, e.g.
    # re.compile('.*English.*') -> 'English'.
    name = repr(item['pattern']).replace("re.compile('.*", "").replace(".*')", "")
    resolved = getlang(lang_tag)
    if resolved is None:
        resolved = getlang_by_name(name)

    lang_tag_to_lang_dict[lang_tag] = dict(
        lang_tag=lang_tag,
        name=name,
        le_code=resolved.code if resolved else None,
    )


# Export as JSON
################################################################################

projects_tree = {}

for project_name, project_langs in projects.items():
Esempio n. 31
0
def test_unknown_name():
    """Looking up a name that matches no known language must return None."""
    result = languages.getlang_by_name('UnknoenLanguage')
    assert result is None, 'UnknoenLanguage name returned non-None'
Esempio n. 32
0
def test_language_names_with_modifier_after_comma():
    """A 'Base, Modifier' style name should resolve via the base name."""
    found = languages.getlang_by_name('Arabic, Tunisian')
    assert found is not None, 'Arabic fallback not found'
    assert found.code == "ar", 'Wrong internal repr. code'
    assert found.name == "Arabic", 'Wrong name'